
Python regex.sub Function Code Examples


This article compiles typical usage examples of Python's regex.sub function. If you are wondering what exactly the sub function does, how to call it, or what it looks like in real code, the selected examples below should help.



A total of 20 code examples of the sub function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
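
Before diving into the examples, here is a minimal orientation sketch of regex.sub itself (the pattern, replacement, and input strings are illustrative and not taken from any of the projects quoted below): regex.sub(pattern, repl, string) returns a copy of string in which every non-overlapping match of pattern is replaced by repl, which may be a plain string or a callable that receives the match object.

import regex

text = "Hello,,,   world!!!"
# repl as a plain string: collapse runs of whitespace to a single space
text = regex.sub(r"\s+", " ", text)          # -> "Hello,,, world!!!"
# repl with a backreference: collapse repeated punctuation to a single character
text = regex.sub(r"([,!])\1+", r"\1", text)  # -> "Hello, world!"
print(text)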

Example 1: clean_text

 def clean_text(text):
     clear_text_regexp = re.compile(r'(?u)\w+|[,.!?]')
     text_ = " ".join(clear_text_regexp.findall(text)).replace(" .", ".").replace(" ,", ",")
     text_ = re.sub("[,]+", ",", text_)
     text_ = re.sub("[.]+", ".", text_)
     text_ = re.sub("\s+", " ", text_)
     return text_
Author: RenatKhayrullin, Project: Diploma, Lines: 7, Source: TextCleaner.py


Example 2: transform

 def transform(self, text):
     for pattern, replace in self.pattern_replace_pair_list:
         try:
             text = regex.sub(pattern, replace, text)
         except:
             pass
     return regex.sub(r"\s+", " ", text).strip()
Author: amsqr, Project: Kaggle_HomeDepot, Lines: 7, Source: data_processor.py


Example 3: parse_implied_depth

    def parse_implied_depth(self, element):
        ja_depth_pattern = ur"\[(\d)\]$"
        ja_sections_pattern = ur"\[(.*)\]$"
        title_str = element.get('text').strip()

        depth_match = re.search(ja_depth_pattern, title_str)
        if depth_match:
            depth = int(depth_match.group(1))
            placeholder_sections = ['Volume', 'Chapter', 'Section', 'Paragraph']
            element.set('text', re.sub(ja_depth_pattern, "", title_str))
            return {'section_names': placeholder_sections[(-1 * depth):], 'address_types' : ['Integer'] * depth}

        sections_match = re.search(ja_sections_pattern, title_str)
        if sections_match:
            sections = [s.strip() for s in sections_match.group(1).split(",")]
            element.set('text', re.sub(ja_sections_pattern, "", title_str))
            section_names = []
            address_types = []
            for s in sections:
                tpl = s.split(":")
                section_names.append(tpl[0])
                address_types.append(tpl[1] if len(tpl) > 1 else 'Integer')

            return {'section_names': section_names, 'address_types' : address_types}
        else:
            return None
Author: JonMosenkis, Project: Sefaria-Project, Lines: 26, Source: parse_index_and_version.py


Example 4: test_post

 def test_post(title, body, user_name, site, is_answer, body_is_summary):
     result = []
     for rule in FindSpam.rules:
         body_to_check = body
         if rule['stripcodeblocks']:
             body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
             body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
         if rule['all'] != (site in rule['sites']):
             matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
             matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
             matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
             if matched_title and rule['title']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "title"))
             if matched_username and rule['username']:
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                         result.append(rule['reason'])
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", "username"))
             if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                 type_of_post = "answer" if is_answer else "body"
                 try:
                     if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                         result.append(rule['reason'].replace("{}", type_of_post))
                 except KeyError:  # There is no special logic for this rule
                     result.append(rule['reason'].replace("{}", type_of_post))
     return result
Author: JC3, Project: SmokeDetector, Lines: 31, Source: findspam.py


Example 5: remove_article

    def remove_article(self, text):
        for art in self.articles:
            text = re.sub("^\s*\m%s\M\s*" % art, " ", text)

        text = re.sub("\s*\mο\M", "", text)
        text = re.sub("\s*\mείναι\M", "", text)
        return text.strip()
Author: garykpdx, Project: panlex-tools, Lines: 7, Source: greek.py


Example 6: lcc_range

def lcc_range(string):
    """
    Takes a string, returns a tuple of two LCClassNumbers, the start and
    end of the range.
    """
    string = string.encode("ascii","replace")
    string = string.replace("(","")
    string = string.replace(")","")
    if string.endswith("A-Z"):
        # TMI in the schedules when they're alphabetical.
        # I don't care.
        string.replace("A-Z","")

    if "-" not in string:
        # A range of self length.
        return (LCCallNumber(string), LCCallNumber(string))

    parts = string.split("-")
    if re.search(r"^\d",parts[1]):
        header = re.sub("^([A-Z]+).*",r"\1",parts[0])
    elif re.search(r"^\.",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..*",r"\1",parts[0])
    elif re.search(r"^[A-Z]",parts[1]):
        header = re.sub(r"^([A-Z]+\d)+\..[A-Z]*",r"\1.",parts[0])            
    else:
        header = " "

    parts[1] = header + parts[1]
    return (
        LCCallNumber(parts[0]),
        LCCallNumber(parts[1])
    )
Author: Bookworm-project, Project: Bookworm-MARC, Lines: 32, Source: bookwormMARC.py


Example 7: tei_spellcheck

def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on an TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be untouched and
    correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.TEIFacsimile): TEIFacsimile object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        A TEIFacsimile object containing the spelling corrections.
    """
    text_tokens = [x[-1] for x in facsimile.segments]
    if filter_punctuation:
        text_tokens = [regex.sub('[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for segment in facsimile.segments:
        key = alg.sanitize(segment[-1])
        if filter_punctuation:
            key = regex.sub('[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(segment[-2], [(sugg, 100 - 10 *
                                  alg.edit_distance(key, sugg))])
    return facsimile
Author: amitdo, Project: nidaba, Lines: 34, Source: lex.py


Example 8: fix_broken_paragraphs

def fix_broken_paragraphs(in_bytes):   
    out = in_bytes
    out = regex.sub(rb'''(?<=\p{lower}\s*)</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'''(?<=\p{lower}\s*)
                        <p[^>]*>(?=\s*\p{lower})''', 
                        b' ',
                        out, flags=regex.VERBOSE|regex.I)
    
    # Deal with a wrong paragraph break on a hyphenated word
    # (v.ugly)
    out = regex.sub(rb'''(?<=\p{lower})-</(blockquote|p|div)>
                        \s*
                        <\1[^>]*>\s*(?=\p{lower})''', 
                        b'',
                        out, flags=regex.VERBOSE|regex.I)
    
    out = regex.sub(rb'(?<=\p{lower})-<p[^>]*>(?=\s*\p{lower})', 
                        b'',
                        out, regex.I)
    return out
Author: jgpacker, Project: suttacentral, Lines: 25, Source: __init__.py


Example 9: normalize_newlines

def normalize_newlines(string):
    out = string.strip()
    out = re.sub(r'\r\n', '\n', out)
    out = re.sub(r'\n{3,}', '\n\n', out)
    out = re.sub(r'\n\s*\n', '\n\n', out)
    out = re.sub(r'"$', '" ', out)
    return out
Author: crw, Project: python-textile, Lines: 7, Source: utils.py


Example 10: rev_ip

def rev_ip(ip, delimiter=None):
    revip = False
    eip = expand_ip(ip)
    prefix = False

    if '/' in eip:
        eip, prefix = regex.split('/', eip)[0:2]
    else:
        if is_ip4.search(eip):
            prefix = '32'
        elif is_ip6.search(eip):
            prefix = '128'

    if prefix:
        prefix = int(prefix)
        if is_ip4.search(eip):
            if prefix in (8, 16, 24, 32):
                revip = '.'.join(eip.split('.')[0:int(prefix / 8)][::-1]) + '.in-addr.arpa.'
            elif delimiter:
                octs = eip.split('.')[::-1]
                octs[3 - int(prefix / 8)] = octs[3 - int(prefix / 8)] + delimiter + str(prefix)
                revip = '.'.join(octs[3 - int(prefix / 8):]) + '.in-addr.arpa.'

        elif is_ip6.search(eip):
            if prefix in (4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128):
                revip = '.'.join(filter(None, regex.split('(.)', regex.sub(':', '', eip))))[0:(prefix / 4) * 2][::-1].strip('.') + '.ip6.arpa.'
            elif delimiter:
                nibs = filter(None, regex.split('(.)', regex.sub(':', '', eip)))[::-1]
                nibs[31 - int(prefix / 4)] = nibs[31 - int(prefix /4)] + delimiter + str(prefix)
                revip = '.'.join(nibs[31 - int(prefix /4):]) + '.ip6.arpa.'

    return revip
Author: cbuijs, Project: unbound-dns-filter, Lines: 32, Source: unbound-dns-filter.py
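
For comparison only (this is not part of the quoted project): for full-length prefixes, the standard library's ipaddress module produces the same kind of reverse-pointer names that rev_ip builds by hand, though reverse_pointer omits the trailing dot and does not handle shortened prefixes or the delimiter option.

import ipaddress

print(ipaddress.ip_address('192.0.2.1').reverse_pointer)    # 1.2.0.192.in-addr.arpa
print(ipaddress.ip_address('2001:db8::1').reverse_pointer)  # ...8.b.d.0.1.0.0.2.ip6.arpa (one label per nibble)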


Example 11: cleanTweet

def cleanTweet(tweet, query_term):
    """
    """
    new_string = ''
    for i in tweet.split(): # remove urls, hashtag characters, and full @username mentions
        s, n, p, pa, q, f = urlparse.urlparse(i)
        if s and n:
            pass
        elif i[:1] == '@':
            pass
        elif i[:1] == '#':
            new_string = new_string.strip() + ' ' + i[1:]
        else:
            new_string = new_string.strip() + ' ' + i

    table = string.maketrans("","") # make a translation table
    new_string = re.sub("[^A-Za-z']+", ' ', new_string) # agressive and removes all non-alphanumeric (works only for latin-based and maybe only English)
    new_string = new_string.replace(" amp ", " ") # remove html code for ampersands($)
    new_string = new_string.lower() # lowercase entire tweet
    new_string = re.sub(r'(.)\1+', r'\1\1', new_string) # reduces any char repition of > 2 to 2.
    new_string = new_string.replace(query_term, " ") # take the original value used to collect tweets as a system argument, and remove it from tweets
    new_string = re.sub(r'(?<!\S)\S{1}(?!\S)', '', new_string)
    new_string = ' '.join(new_string.split()) # remove additional spaces

    return new_string
Author: asmiley, Project: tweet-emotion-classifier, Lines: 25, Source: tweet-soap.py


Example 12: html_adhoc_fetcher

def html_adhoc_fetcher(url):
    html = None
    for _ in range(5):
        opener = urllib2.build_opener()
        TIME_OUT = 5
        try:
            html = opener.open(str(url), timeout = TIME_OUT).read()
        except Exception as e:
            print('[WARN] Cannot access url with UnicodeEncodeError, try number is...', e, _, url, mp.current_process() )
            continue
    #print "b"
    if html == None:
        return None
    line = html.replace('\n', '^A^B^C')
    line = regex.sub('<!--.*?-->', '',  line)
    line = regex.sub('<style.*?/style>', '',  line)
    html = regex.sub('<script.*?/script>', '', line ).replace('^A^B^C', ' ')
 
    #print "c"
    soup = bs4.BeautifulSoup(html, "html.parser")
    title = (lambda x:unicode(x.string) if x != None else 'Untitled')( soup.title )
    contents0_text = (lambda x:x.text.encode('utf-8') if x != None else "" )( soup.find('div', {'class': 'ui-section-body'}) )
    #contents0_text = "dummy"
    links = set([a['href'] for a in soup.find_all('a', href=True)])
    return title, contents0_text, links
Author: GINK03, Project: KindleReferencedIndexScore, Lines: 25, Source: adhocSuumoJournalParser.py


Example 13: normalize

 def normalize(self, s):
     s = re.sub(":","",s) # subtitle :
     s = re.sub("-","",s) # subtitle -
     s = re.sub("  "," ",s) # remove double space
     s = re.sub("The ","",s) # remove prefix The      
     s = re.sub(", The","",s) # remove suffix ,The
     return s        
Author: larrykoubiak, Project: gamedbpython, Lines: 7, Source: matcher.py


Example 14: parse_text

def parse_text(element):
    n = (element.attrib["_note"])
    n = re.sub(r'[/]', '<br>', n)
    n = re.sub(r'[(]', '<em><small>', n)
    n = re.sub(r'[)]', '</small></em>', n)
    prayer = n.strip().splitlines()
    return prayer
Author: bachrach44, Project: Sefaria-Project, Lines: 7, Source: Parse_Index_and_Version.py


Example 15: parse_text

def parse_text(element):
    n = element.attrib["_note"]
    n = re.sub(r"[/]", "<br>", n)
    n = re.sub(r"[(]", "<em><small>", n)
    n = re.sub(r"[)]", "</small></em>", n)
    prayer = n.strip().splitlines()
    return prayer
Author: rneiss, Project: Sefaria-Project, Lines: 7, Source: Parse_Index_and_Version.py


Example 16: all_caps_text

def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
Author: rekire, Project: SmokeDetector, Lines: 7, Source: findspam.py


Example 17: main

def main():

    args = parser.parse_args()

    tt = TinyTokenizer()

    for line in open(args.infile):
        line=line.strip()

        out = tt.tokenize(line)
        outline = " ".join(out)
        try:
            assert(str(regex.sub(r"\s","",line))==str(regex.sub("\s","",outline)))
            if args.conll:
                for w in out:
                    print(w)
                print()
            else:
                print(outline)
            
        except:
            print("==== CHECK FILE! ====",  args.infile, file=sys.stderr)
            print("+"*20, file=sys.stderr)
            print("in:  >>{}<<".format(line), file=sys.stderr)
            print("out: >>{}<<".format(outline), file=sys.stderr)     
            print(str(regex.sub(r"\s","",line)), file=sys.stderr)
            print(str(regex.sub(r"\s","",outline)), file=sys.stderr)
Author: bplank, Project: multilingualtokenizer, Lines: 27, Source: tinytokenizer.py


Example 18: main

def main():

	transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()

	# Save lemma to translations found
	found_translist = {}

	try:
		while (True):

			scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)

			input_phrase = input("Enter Search Phrase>  ")

			if re.sub(" ", "", re.sub("q", "", input_phrase)) == "" or re.sub(" ", "", re.sub("quit", "", input_phrase)) == "":
				exit(0)

			if (valid_search(input_phrase)):
				
				search = search_phrase(input_phrase, "Latin")

				# Find all the translations of the given words
				for i in range(search.search_len):
					search.has_translation[i] = trn.get_translation(search.text[i], transDict, found_translist)
		
				xls.try_all_search_combos(search, scoreKeeper, Greek_word_num, Greek_search_dict, Greek_text)

				print(scoreKeeper)

			else:
				print('Please enter a valid string\n')

	except KeyboardInterrupt:
		print('\nProgram Terminated\n')
		sys.exit(0)
Author: baileymiller, Project: intertextualityProject, Lines: 35, Source: search_by_phrase.py


Example 19: clean_name

def clean_name(name):
    """
    Cleans a show/movie name for searching.

    :param name: release name
    :return: cleaned name
    """

    name = unicodedata.normalize('NFKD', name)

    name = regex.sub('[._\-]', ' ', name)
    name = regex.sub('[\':!"#*’,()?]', '', name)
    name = regex.sub('\s{2,}', ' ', name)
    name = regex.sub('\[.*?\]', '', name)

    replace_chars = {
        '$': 's',
        '&': 'and',
        'ß': 'ss'
    }

    for k, v in replace_chars.items():
        name = name.replace(k, v)

    name = CLEANING_REGEX.sub('', name)

    return name.lower()
Author: Murodese, Project: pynab, Lines: 27, Source: ids.py


Example 20: scrape_wiki

def scrape_wiki():
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"

    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")

    pairs = []
    links = []

    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))

    for pair in pairs:
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({"refs": [
            u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen),
            u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
        ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
        return links
Author: Sefaria, Project: Sefaria-Data, Lines: 35, Source: scraper_chinukh_rambam.py



Note: The regex.sub examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors, and copyright remains with those authors; consult each project's license before redistributing or reusing the code. Do not reproduce this article without permission.

