This article collects typical usage examples of the regex.sub function in Python. If you have been wondering exactly what regex.sub does, how to call it, or what it looks like in real code, the curated examples below should help.
Twenty code examples of the sub function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
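Before the project-level examples, here is a minimal refresher on the call itself; this snippet is a small self-contained sketch, not taken from any of the projects cited below. regex.sub mirrors the standard library's re.sub signature (pattern, replacement, string, count=0, flags=0) while adding extras such as Unicode property classes.

import regex

# Same call shape as re.sub: substitute every match of the pattern
print(regex.sub(r"\s+", " ", "too   many    spaces"))   # -> 'too many spaces'

# A regex-module extra: Unicode property classes such as \p{P} (punctuation)
print(regex.sub(r"\p{P}+", "", "wait... what?!"))        # -> 'wait what'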
Example 1: clean_text
def clean_text(text):
    # Keep only word characters and basic punctuation, then re-join with single spaces
    clear_text_regexp = re.compile(r'(?u)\w+|[,.!?]')
    text_ = " ".join(clear_text_regexp.findall(text)).replace(" .", ".").replace(" ,", ",")
    # Collapse repeated punctuation and whitespace
    text_ = re.sub(r"[,]+", ",", text_)
    text_ = re.sub(r"[.]+", ".", text_)
    text_ = re.sub(r"\s+", " ", text_)
    return text_
Developer: RenatKhayrullin | Project: Diploma | Lines of code: 7 | Source file: TextCleaner.py
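A quick usage sketch for clean_text; it assumes `re` is bound to the regex module (for example `import regex as re`), which the excerpt itself does not show.

import regex as re  # assumption: the excerpt's `re` refers to the third-party regex module

# clean_text as defined in Example 1 above
print(clean_text("Hello ,,  world ...  fine ."))
# -> 'Hello, world. fine.'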
Example 2: transform
def transform(self, text):
    # Apply each (pattern, replacement) pair in order; silently skip pairs that fail
    for pattern, replace in self.pattern_replace_pair_list:
        try:
            text = regex.sub(pattern, replace, text)
        except Exception:
            pass
    # Collapse whitespace runs left behind by the substitutions
    return regex.sub(r"\s+", " ", text).strip()
Developer: amsqr | Project: Kaggle_HomeDepot | Lines of code: 7 | Source file: data_processor.py
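The same idea as a standalone helper, for anyone who wants to try it outside the class; the function and variable names below are illustrative, not taken from the Kaggle_HomeDepot project.

import regex

def apply_replacements(text, pattern_replace_pairs):
    # Apply each (pattern, replacement) pair in order, ignoring invalid patterns
    for pattern, replace in pattern_replace_pairs:
        try:
            text = regex.sub(pattern, replace, text)
        except regex.error:
            pass
    return regex.sub(r"\s+", " ", text).strip()

print(apply_replacements("5 ft. x 3 ft.", [(r"ft\.", "feet"), (r"\bx\b", "by")]))
# -> '5 feet by 3 feet'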
Example 3: parse_implied_depth
def parse_implied_depth(self, element):
    # Python 2 source: ur"" literals as in the original project
    ja_depth_pattern = ur"\[(\d)\]$"
    ja_sections_pattern = ur"\[(.*)\]$"
    title_str = element.get('text').strip()
    depth_match = re.search(ja_depth_pattern, title_str)
    if depth_match:
        depth = int(depth_match.group(1))
        placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
        element.set('text', re.sub(ja_depth_pattern, "", title_str))
        return {'section_names': placeholder_sections[(-1 * depth):], 'address_types': ['Integer'] * depth}
    sections_match = re.search(ja_sections_pattern, title_str)
    if sections_match:
        sections = [s.strip() for s in sections_match.group(1).split(",")]
        element.set('text', re.sub(ja_sections_pattern, "", title_str))
        section_names = []
        address_types = []
        for s in sections:
            tpl = s.split(":")
            section_names.append(tpl[0])
            address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')
        return {'section_names': section_names, 'address_types': address_types}
    else:
        return None
Developer: JonMosenkis | Project: Sefaria-Project | Lines of code: 26 | Source file: parse_index_and_version.py
Example 4: test_post
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            # Remove code blocks before matching, so code snippets cannot trigger spam rules
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
            matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
            matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
            if matched_title and rule['title']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
Developer: JC3 | Project: SmokeDetector | Lines of code: 31 | Source file: findspam.py
Example 5: remove_article
def remove_article(self, text):
    # Strip Greek articles (and the copula 'είναι') from a phrase;
    # \m and \M are the regex module's start-of-word / end-of-word boundaries
    for art in self.articles:
        text = re.sub(r"^\s*\m%s\M\s*" % art, " ", text)
    text = re.sub(r"\s*\mο\M", "", text)
    text = re.sub(r"\s*\mείναι\M", "", text)
    return text.strip()
Developer: garykpdx | Project: panlex-tools | Lines of code: 7 | Source file: greek.py
Example 6: lcc_range
def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCCallNumber objects, the start and
    end of the range.
    """
    string = string.encode("ascii", "replace")
    string = string.replace("(", "")
    string = string.replace(")", "")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string = string.replace("A-Z", "")
    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))
    parts = string.split("-")
    # Rebuild the implied prefix of the range's end point from its start point
    if re.search(r"^\d", parts[1]):
        header = re.sub(r"^([A-Z]+).*", r"\1", parts[0])
    elif re.search(r"^\.", parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*", r"\1", parts[0])
    elif re.search(r"^[A-Z]", parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*", r"\1.", parts[0])
    else:
        header = " "
    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
Developer: Bookworm-project | Project: Bookworm-MARC | Lines of code: 32 | Source file: bookwormMARC.py
Example 7: tei_spellcheck
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be untouched and
    correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub(r'[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub(r'[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                                 alg.edit_distance(key, sugg))])
    return facsimile
Developer: amitdo | Project: nidaba | Lines of code: 34 | Source file: lex.py
Example 8: fix_broken_paragraphs
def fix_broken_paragraphs(in_bytes):
    out = in_bytes
    # Join paragraphs that were split in the middle of a sentence
    # (a closing tag immediately reopened between two lowercase letters)
    out = regex.sub(rb'''(?<=\p{lower}\s*)</(blockquote|p|div)>
                         \s*
                         <\1[^>]*>\s*(?=\p{lower})''',
                    b' ',
                    out, flags=regex.VERBOSE | regex.I)
    out = regex.sub(rb'''(?<=\p{lower}\s*)
                         <p[^>]*>(?=\s*\p{lower})''',
                    b' ',
                    out, flags=regex.VERBOSE | regex.I)
    # Deal with a wrong paragraph break on a hyphenated word
    # (v.ugly)
    out = regex.sub(rb'''(?<=\p{lower})-</(blockquote|p|div)>
                         \s*
                         <\1[^>]*>\s*(?=\p{lower})''',
                    b'',
                    out, flags=regex.VERBOSE | regex.I)
    out = regex.sub(rb'(?<=\p{lower})-<p[^>]*>(?=\s*\p{lower})',
                    b'',
                    out, flags=regex.I)
    return out
Developer: jgpacker | Project: suttacentral | Lines of code: 25 | Source file: __init__.py
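A rough usage sketch for fix_broken_paragraphs, showing two paragraph fragments being re-joined; the sample HTML is invented for illustration.

import regex

# fix_broken_paragraphs as defined in Example 8 above
html = b'<p>This paragraph was</p>\n<p>split in the middle of a sentence.</p>'
print(fix_broken_paragraphs(html))
# -> b'<p>This paragraph was split in the middle of a sentence.</p>'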
Example 9: normalize_newlines
def normalize_newlines(string):
    out = string.strip()
    out = re.sub(r'\r\n', '\n', out)       # normalize Windows line endings
    out = re.sub(r'\n{3,}', '\n\n', out)   # at most one blank line in a row
    out = re.sub(r'\n\s*\n', '\n\n', out)  # strip whitespace-only lines
    out = re.sub(r'"$', '" ', out)         # pad a trailing double quote
    return out
Developer: crw | Project: python-textile | Lines of code: 7 | Source file: utils.py
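A short check of what normalize_newlines does to a messy string; it behaves the same whether `re` is the standard library module or the regex module imported as `re`.

raw = 'first line\r\nsecond line\n\n\n\n\nthird line   \n'
print(repr(normalize_newlines(raw)))
# -> 'first line\nsecond line\n\nthird line'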
Example 10: rev_ip
def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False
    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'
    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                # Prefix on an octet boundary: plain reverse zone
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                # Otherwise encode the prefix length into the boundary octet
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'
        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:int(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = list(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix / 4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix / 4):]) + '.ip6.arpa.'
    return revip
Developer: cbuijs | Project: unbound-dns-filter | Lines of code: 32 | Source file: unbound-dns-filter.py
Example 11: cleanTweet
def cleanTweet(tweet, query_term):
    """
    Strip URLs, @mentions and hashtag characters from a tweet, then normalize the text.
    (Python 2 source: uses urlparse and string.maketrans.)
    """
    new_string = ''
    for i in tweet.split():  # remove urls, hashtag characters, and full @username mentions
        s, n, p, pa, q, f = urlparse.urlparse(i)
        if s and n:
            pass
        elif i[:1] == '@':
            pass
        elif i[:1] == '#':
            new_string = new_string.strip() + ' ' + i[1:]
        else:
            new_string = new_string.strip() + ' ' + i
    table = string.maketrans("", "")  # make a translation table
    new_string = re.sub("[^A-Za-z']+", ' ', new_string)  # aggressive: removes everything non-alphabetic (works only for Latin-based scripts, maybe only English)
    new_string = new_string.replace(" amp ", " ")  # remove html code for ampersands (&amp;)
    new_string = new_string.lower()  # lowercase entire tweet
    new_string = re.sub(r'(.)\1+', r'\1\1', new_string)  # reduce any character repetition of > 2 to 2
    new_string = new_string.replace(query_term, " ")  # remove the query term originally used to collect the tweets
    new_string = re.sub(r'(?<!\S)\S{1}(?!\S)', '', new_string)  # drop single-character tokens
    new_string = ' '.join(new_string.split())  # remove additional spaces
    return new_string
Developer: asmiley | Project: tweet-emotion-classifier | Lines of code: 25 | Source file: tweet-soap.py
Example 12: html_adhoc_fetcher
def html_adhoc_fetcher(url):
    html = None
    for _ in range(5):
        opener = urllib2.build_opener()
        TIME_OUT = 5
        try:
            html = opener.open(str(url), timeout=TIME_OUT).read()
        except Exception as e:
            print('[WARN] Cannot access url, try number is...', e, _, url, mp.current_process())
            continue
    if html is None:
        return None
    # Protect newlines, strip comments, styles and scripts, then restore spacing
    line = html.replace('\n', '^A^B^C')
    line = regex.sub('<!--.*?-->', '', line)
    line = regex.sub('<style.*?/style>', '', line)
    html = regex.sub('<script.*?/script>', '', line).replace('^A^B^C', ' ')
    soup = bs4.BeautifulSoup(html, "html.parser")
    title = (lambda x: unicode(x.string) if x != None else 'Untitled')(soup.title)
    contents0_text = (lambda x: x.text.encode('utf-8') if x != None else "")(soup.find('div', {'class': 'ui-section-body'}))
    links = set([a['href'] for a in soup.find_all('a', href=True)])
    return title, contents0_text, links
Developer: GINK03 | Project: KindleReferencedIndexScore | Lines of code: 25 | Source file: adhocSuumoJournalParser.py
Example 13: normalize
def normalize(self, s):
    s = re.sub(":", "", s)      # subtitle :
    s = re.sub("-", "", s)      # subtitle -
    s = re.sub("  ", " ", s)    # remove double space
    s = re.sub("The ", "", s)   # remove prefix The
    s = re.sub(", The", "", s)  # remove suffix ,The
    return s
Developer: larrykoubiak | Project: gamedbpython | Lines of code: 7 | Source file: matcher.py
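An illustrative call showing the effect on a game title; Matcher is a hypothetical name for whatever class defines the method above, and `import regex as re` is assumed.

m = Matcher()  # hypothetical instance of the class that defines normalize()
print(m.normalize("The Legend of Zelda: A Link to the Past"))
# -> 'Legend of Zelda A Link to the Past'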
Example 14: parse_text
def parse_text(element):
    n = (element.attrib["_note"])
    n = re.sub(r'[/]', '<br>', n)          # slashes become line breaks
    n = re.sub(r'[(]', '<em><small>', n)   # parentheses become emphasis markup
    n = re.sub(r'[)]', '</small></em>', n)
    prayer = n.strip().splitlines()
    return prayer
Developer: bachrach44 | Project: Sefaria-Project | Lines of code: 7 | Source file: Parse_Index_and_Version.py
Example 15: parse_text
def parse_text(element):
    n = element.attrib["_note"]
    n = re.sub(r"[/]", "<br>", n)
    n = re.sub(r"[(]", "<em><small>", n)
    n = re.sub(r"[)]", "</small></em>", n)
    prayer = n.strip().splitlines()
    return prayer
Developer: rneiss | Project: Sefaria-Project | Lines of code: 7 | Source file: Parse_Index_and_Version.py
Example 16: all_caps_text
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub(r"&\w+;", "", s)    # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""  # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
Developer: rekire | Project: SmokeDetector | Lines of code: 7 | Source file: findspam.py
Example 17: main
def main():
    args = parser.parse_args()
    tt = TinyTokenizer()
    for line in open(args.infile):
        line = line.strip()
        out = tt.tokenize(line)
        outline = " ".join(out)
        try:
            # Tokenization must preserve every non-whitespace character
            assert(str(regex.sub(r"\s", "", line)) == str(regex.sub(r"\s", "", outline)))
            if args.conll:
                for w in out:
                    print(w)
                print()
            else:
                print(outline)
        except:
            print("==== CHECK FILE! ====", args.infile, file=sys.stderr)
            print("+" * 20, file=sys.stderr)
            print("in: >>{}<<".format(line), file=sys.stderr)
            print("out: >>{}<<".format(outline), file=sys.stderr)
            print(str(regex.sub(r"\s", "", line)), file=sys.stderr)
            print(str(regex.sub(r"\s", "", outline)), file=sys.stderr)
Developer: bplank | Project: multilingualtokenizer | Lines of code: 27 | Source file: tinytokenizer.py
Example 18: main
def main():
    transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()
    # Save lemma to translations found
    found_translist = {}
    try:
        while (True):
            scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)
            input_phrase = input("Enter Search Phrase> ")
            # Exit on blank input or on input consisting only of "q"/"quit" and spaces
            if re.sub(" ", "", re.sub("q", "", input_phrase)) == "" or re.sub(" ", "", re.sub("quit", "", input_phrase)) == "":
                exit(0)
            if (valid_search(input_phrase)):
                search = search_phrase(input_phrase, "Latin")
                # Find all the translations of the given words
                for i in range(search.search_len):
                    search.has_translation[i] = trn.get_translation(search.text[i], transDict, found_translist)
                xls.try_all_search_combos(search, scoreKeeper, Greek_word_num, Greek_search_dict, Greek_text)
                print(scoreKeeper)
            else:
                print('Please enter a valid string\n')
    except KeyboardInterrupt:
        print('\nProgram Terminated\n')
        sys.exit(0)
Developer: baileymiller | Project: intertextualityProject | Lines of code: 35 | Source file: search_by_phrase.py
Example 19: clean_name
def clean_name(name):
    """
    Cleans a show/movie name for searching.

    :param name: release name
    :return: cleaned name
    """
    name = unicodedata.normalize('NFKD', name)
    name = regex.sub(r'[._\-]', ' ', name)          # separators become spaces
    name = regex.sub(r'[\':!"#*’,()?]', '', name)   # drop punctuation
    name = regex.sub(r'\s{2,}', ' ', name)          # collapse repeated spaces
    name = regex.sub(r'\[.*?\]', '', name)          # drop bracketed release tags
    replace_chars = {
        '$': 's',
        '&': 'and',
        'ß': 'ss'
    }
    for k, v in replace_chars.items():
        name = name.replace(k, v)
    name = CLEANING_REGEX.sub('', name)
    return name.lower()
Developer: Murodese | Project: pynab | Lines of code: 27 | Source file: ids.py
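Two illustrative calls; CLEANING_REGEX is defined elsewhere in the pynab source, so the stand-in below is a hypothetical pattern used only so the sketch runs.

import unicodedata
import regex

# Hypothetical stand-in for pynab's CLEANING_REGEX (the real pattern differs)
CLEANING_REGEX = regex.compile(r'\b(?:720p|1080p|x264|bluray|web-dl)\b', regex.I)

# clean_name as defined in Example 19 above
print(clean_name("Game of Thrones: S08"))   # -> 'game of thrones s08'
print(clean_name("Phineas & Ferb"))         # -> 'phineas and ferb'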
Example 20: scrape_wiki
def scrape_wiki():
    # Python 2 source: print statements and u"" literals as in the original project
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"
    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")
    pairs = []
    links = []
    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))
    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
                    u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
                    u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
                ],
                "type": "Sifrei Mitzvot",
                "auto": True,
                "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker: what is this parameter intended to be?
                })
        print link['refs']
        links.append(link)
    return links
Developer: Sefaria | Project: Sefaria-Data | Lines of code: 35 | Source file: scraper_chinukh_rambam.py
Note: The regex.sub examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution or reuse must follow each project's License. Do not reproduce without permission.