本文整理汇总了Python中regex.compile函数的典型用法代码示例。如果您正苦于以下问题:Python compile函数的具体用法?Python compile怎么用?Python compile使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了compile函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, start=None, end=None, void=None, structs=None):
self.start = start if start else re.compile(r"<(\w+).*?(?<!/)>")
self.end = end if end else re.compile(r"</(\w+)>")
self.void = void if void else re.compile(r"<(\w+).*?/>")
self.stags = set()
self.etags = set()
self.vtags = set()
开发者ID:dlukes,项目名称:pyvert,代码行数:7,代码来源:_pyvert.py
示例2: clean_line
def clean_line(line):
    """Normalise one line of Hebrew text and strip reference spans.

    Removes vocalisation (nikkud), normalises punctuation and Hebrew
    quote marks, cuts "see in the Tur" cross-references, removes
    "lo manu" spans, and unwraps/removes bracketed and parenthesised
    text.  Returns the cleaned line.  (Python 2 code: ``u''``/``ur''``
    literals; ``strip_nikkud`` and ``multiple_replace`` are helpers
    defined elsewhere in the project.)
    """
    line = strip_nikkud(line)
    # Regex character classes mapped to replacements: drop sentence
    # punctuation, normalise gershayim/geresh to ASCII quote marks.
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}
    line = multiple_replace(line, replace_dict, using_regex=True)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # Search BEFORE the substitutions below: presence of parens/brackets
    # decides the unwrap strategy at the end.
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    # "see in the Tur" style cross-reference; everything after it is cut.
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    # "(but) not listed ..." span, terminated by a source abbreviation
    # (SMaG / Rambam / Tur) or a newline; group 'a' is removed below.
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)  # drop references to Alfasi
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)  # canonical abbreviation
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)
    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # NOTE(review): the matched text is re-used as a pattern here,
        # so regex metacharacters in it would misfire — confirm inputs.
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
开发者ID:JonMosenkis,项目名称:Sefaria-Data,代码行数:32,代码来源:ein_parser.py
示例3: readConfigFile
def readConfigFile (
    source      # pathname to config file to read
    ):
    # Purpose: read the configuration file at 'source', parse it,
    #   store values in a dictionary
    # Returns: the dictionary parsed from 'source'
    # Assumes: 'source' exists
    # Effects: reads from the file system
    # Throws: IOError if there are problems reading
    # NOTE: uses the legacy Python 1.x ``regex`` module, in which
    # ``\(...\)`` delimits groups, ``match`` returns the match length
    # or -1 on failure, and groups are read back off the pattern
    # object itself after a successful match.
    fp = open (source, 'r')
    lines = fp.readlines ()
    fp.close ()
    ignore_line = regex.compile ('[ \t]*#')     # comment line
    # data line: "<parameter> <value>", two groups split on blanks/tabs
    data_line = regex.compile ('[ \t]*'
        '\([^ \t]+\)'
        '[ \t]*\(.*\)')
    # NOTE(review): local name ``dict`` shadows the builtin.
    dict = {}
    for line in lines:
        if ignore_line.match (line) == -1:      # not a comment line
            if data_line.match (line) != -1:
                (parameter, value) = data_line.group (1,2)
                # parameter names are stored upper-cased
                dict [string.upper (parameter)] = value
    return dict
开发者ID:mgijax,项目名称:websql,代码行数:26,代码来源:config.py
示例4: sample1
def sample1(filename, aft=None, fore=None, top=None, home=None):
    # Build the Declaration of Independence sample page with HTMLgen
    # and write it to htmldir/filename.  aft/fore/top/home are optional
    # navigation-link targets for the series document.
    # NOTE: uses the legacy ``regex``/``regsub`` modules: ``search``
    # returns a position or -1, and groups are read off the pattern
    # object after a successful search.
    doc = SeriesDocument('HTMLgen.rc')
    doc.goprev,doc.gonext,doc.gotop,doc.gohome = aft,fore,top,home
    doc.background = '../image/texturec.jpg'
    doc.banner = ('../image/historic.gif', 472, 60)
    doc.author = '1776 Thomas Jefferson'
    doc.email = '[email protected]'
    doc.logo = ('../image/eagle21.gif', 64, 54)
    # parse Declaration of Independence
    re_hline = regex.compile('^--+$')           # horizontal-rule marker
    re_title = regex.compile('^Title:\(.*$\)')  # group 1 = title text
    font2 = Font(size='+2')
    s = open(os.path.join(datadir, 'DoI.txt')).read()
    # split into paragraphs on runs of blank (possibly whitespace) lines
    paragraphs = regsub.split(s, '\n\([\t ]*\n\)+')
    for para in paragraphs:
        if not para: continue
        if re_title.search(para) > -1:
            doc.title = re_title.group(1)
        elif re_hline.search(para) > -1:
            doc.append(HR())
        else:
            p = Paragraph( para )
            # using \` to match beginning of paragraph
            # ^ won't work because it'll match all the newlines
            n = p.markup('\`\(\w\)', font2, reg_type='regex')
            doc.append(p)
    doc.write(os.path.join(htmldir, filename))
开发者ID:daveray,项目名称:soardoc,代码行数:27,代码来源:HTMLtest.py
示例5: updateline
def updateline(file, key, value, casefold = 1):
    # Update (insert, replace or delete) the "key: value" line in a
    # headers-style file.  value=None deletes the line; a missing line
    # is appended.  Uses the legacy Python 1.x ``regex`` module:
    # ``\(...\)`` are groups and ``prog.match`` returns the match
    # LENGTH (or -1), so ``match(line) == len(line)`` tests that the
    # pattern covers the whole line including its newline.
    try:
        f = open(file, 'r')
        lines = f.readlines()
        f.close()
    except IOError:
        lines = []      # treat a missing/unreadable file as empty
    pat = key + ':\(.*\)\n'
    if casefold:
        prog = regex.compile(pat, regex.casefold)   # case-insensitive key
    else:
        prog = regex.compile(pat)
    if value is None:
        newline = None      # None value means: delete the line
    else:
        # NOTE(review): no trailing '\n' here although the pattern
        # requires one — a replaced line merges with the following
        # line on output; confirm whether this is intended.
        newline = '%s: %s' % (key, value)
    for i in range(len(lines)):
        line = lines[i]
        if prog.match(line) == len(line):
            if newline is None:
                del lines[i]
            else:
                lines[i] = newline
            break
    else:
        # key not present: append unless this was a deletion
        if newline is not None:
            lines.append(newline)
    # NOTE(review): output is written to the global ``tempfile`` path,
    # not back to ``file`` — presumably the caller renames/swaps the
    # files afterwards; confirm against the enclosing module.
    f = open(tempfile, 'w')
    for line in lines:
        f.write(line)
    f.close()
开发者ID:asottile,项目名称:ancient-pythons,代码行数:31,代码来源:mhlib.py
示例6: test_post
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    """Run every FindSpam rule against a post and collect matched reasons.

    Checks the rule's pattern against the title, the user name and the
    (optionally code-stripped) body, applying each rule's per-field
    switches.  Returns a list of reason strings, with any "{}"
    placeholder replaced by the name of the matching part.
    """
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            # don't let spam patterns fire on quoted code
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            # compile the rule's pattern once instead of once per field
            compiled_rule = regex.compile(rule['regex'], regex.UNICODE)
            matched_title = compiled_rule.findall(title)
            matched_username = compiled_rule.findall(user_name)
            matched_body = compiled_rule.findall(body_to_check)
            if matched_title and rule['title']:
                try:
                    # a rule may name a FindSpam method for extra validation;
                    # a missing 'validation_method' key raises KeyError
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
开发者ID:JC3,项目名称:SmokeDetector,代码行数:31,代码来源:findspam.py
示例7: tlg_plaintext_cleanup
def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
    """Remove and substitute post-processing for Greek TLG text.

    Strips editorial markup, guillemets, curly quotes, braced/bracketed
    editorial insertions and Latin characters/digits; optionally strips
    punctuation (and periods); finally collapses all whitespace runs to
    single spaces.  Returns the cleaned text.

    TODO: Surely more junk to pull out. Please submit bugs!
    TODO: {.+?}|\(.+?\) working?
    """
    remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]', flags=regex.VERSION1)
    text = remove_comp.sub('', text)
    if rm_punctuation:
        punctuation = {',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}'}
        if rm_periods:
            punctuation |= {'.', ';'}
        # Filter in a single pass (set membership + join) instead of a
        # quadratic += loop.  Assigning unconditionally also fixes the
        # old bug where an empty filtered result was silently discarded,
        # leaving the punctuation in place.
        text = ''.join(char for char in text if char not in punctuation)
    # replace line breaks w/ space, then collapse whitespace runs
    replace_comp = regex.compile(r'\n')
    text = replace_comp.sub(' ', text)
    comp_space = regex.compile(r'\s+')
    text = comp_space.sub(' ', text)
    return text
开发者ID:jfaville,项目名称:cltk,代码行数:32,代码来源:formatter.py
示例8: setliteral
def setliteral(self, tag):
    # Switch the lexer into literal (CDATA-like) mode: input is treated
    # as character data until the end tag for ``tag`` is seen.
    self.literal = 1
    # End-tag pattern: ETAGO + tag + optional whitespace + TAGC.
    # NOTE: local name ``re`` is just the pattern STRING here (legacy
    # code predating the ``re`` module; uses the Python 1.x ``regex``
    # module below).
    re = "%s%s[%s]*%s" % (ETAGO, tag, string.whitespace, TAGC)
    if self._normfunc is string.lower:
        # tag names are being lower-cased, so match case-insensitively
        self._lit_etag_re = regex.compile(re, regex.casefold)
    else:
        self._lit_etag_re = regex.compile(re)
开发者ID:ashumeow,项目名称:grail,代码行数:7,代码来源:SGMLLexer.py
示例9: __init__
def __init__(self, src, javaFlag=0):
    """Initialise the converter and pre-compile formula regexes.

    Turns on the title, brief description and detailed description
    markup; they are turned off when inside member documentation.
    (This summary previously sat *after* the superclass call, where it
    was a no-op string rather than the docstring.)
    """
    Doxy2SWIG.__init__(self, src, javaFlag)
    self.FilterTitle = True
    self.sitkClassName = ''
    self.EmptyText = False
    # compiled regular expressions:
    # common formula types in the xml version of the documentation
    self.dollarFormula = re.compile("^\\$(.+)\\$$")
    self.arrayFormula = re.compile("^\\\\\\[(.+)\\\\\\]$")
    # more complex formula layout, that breaks R documentation checks
    self.mathstuff1 = re.compile(r"\\begin\{array\}\{[^}]+\}")
    self.mathstuff2 = re.compile(r"\\begin\{array\}")
    self.mathstuff3 = re.compile(r"\\end\{array\}")
    # complex recursive patterns (the (?1) subpattern recursion needs
    # the third-party ``regex`` package) to deal with formulas inside
    # \mbox and \text structures
    self.mathstuff4 = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    self.mathstuff5 = regex.compile(r"\\text({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    # the special doxygen tags - note - not greedy
    self.mathstuff6 = re.compile(r"\\f\$(.+?)\\f\$")
    # alignment tags
    self.mathstuff7 = re.compile(r" & ")
开发者ID:kaspermarstal,项目名称:SimpleElastix,代码行数:26,代码来源:doxy2swig.py
示例10: all_caps_text
def all_caps_text(s, site):
    """Detect text written essentially all in capitals.

    Returns (True, "All in caps") for sufficiently long all-caps text,
    (False, "") for short texts containing common technical all-caps
    words, and falls off the end (implicit None) otherwise.  ``site``
    is unused here — presumably kept for a common rule-checker
    signature; confirm against the caller.  (Python 2 code: ``ur''``
    literals; uses the third-party ``regex`` package for \p{...}.)
    """
    s = regex.sub("<[^>]*>", "", s)  # remove HTML tags
    s = regex.sub("&\w+;", "", s)  # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""  # common words in non-spam all-caps titles
    # at least one upper-case letter and no lower-case letters at all
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
开发者ID:rekire,项目名称:SmokeDetector,代码行数:7,代码来源:findspam.py
示例11: _replace_for
def _replace_for(self, text, nested_position, keyword_number=1):
"""
Finds and replace the % for: ... % endfor loops
of the mail.template. It will create keyword records for
each loop found.
:param text: mail.template text
:param nested_position: counts how nested if the current pass
:param keyword_number: counts how many for we found
:return: simplified text without the if code, keywords found
"""
# Regex for finding text wrapped in loops
loop_regex = r'(% for .*?:$)(.*?)(% endfor)'
ul_loop_regex = r'(?:<ul[^<]*?)(% for .*?:$)(.*?)(% endfor)(.*?</ul>)'
# First scan for ul_loops
for_pattern = re.compile(ul_loop_regex, flags=re.DOTALL | re.MULTILINE)
simple_text, found_keywords = self._replace_for_type(
text, nested_position, keyword_number, 'for_ul', for_pattern)
keyword_number += len(found_keywords)
# Then scan for regular loops
for_pattern = re.compile(loop_regex, flags=re.DOTALL | re.MULTILINE)
simple_text, keywords = self._replace_for_type(
simple_text, nested_position, keyword_number, 'for', for_pattern)
found_keywords |= keywords
return simple_text, found_keywords
开发者ID:maxime-beck,项目名称:compassion-modules,代码行数:27,代码来源:communication_revision.py
示例12: __init__
def __init__(self, directory_name):
    """Initialise a scorer over *directory_name*.

    Sets up an empty unigram counter, an empty trigram table, and the
    pre-compiled patterns used when loading trigram files.
    """
    self.directory = directory_name
    self.unigram_frequency = Counter()
    self.trigrams = {}
    # "<tok1> <tok2> <tok3>\t<count>" lines in the trigram files
    self.trigram_load_pattern = re2.compile(r'^([^ ]*) ([^ ]*) ([^\t]*)\t(\d*)')
    # the middle token must be purely alphanumeric (POSIX class)
    self.middle_token_pattern = re2.compile(r'^\p{posix_alnum}*$', re2.UNICODE)
    super(FileScorer, self).__init__()
开发者ID:o76923,项目名称:PyGTM,代码行数:7,代码来源:SimScorer.py
示例13: _reload_allowed_list_file
def _reload_allowed_list_file(self):
    '''(Re)loads the list with rules for non-segment borders, e.g. stops
    the possible segment border being split (if not forced by a forcing
    rule specified for the stop rule).  The stop rules are pairs of two
    rules of which the first is matched against the segment to the left,
    and the latter is matched against the segment to the right.  The
    filename is given in the __init__, and the default file is
    "./data/stop_list".  See the __init__() and segment() function for
    more about the algorithm.
    ATTENTION: note that verbose regexps are used.'''
    with open(self._allowed_list_filename, 'r') as f:
        _filedata = f.readlines()
    self._allowed_regexps = list()
    _rule_left = ''
    _rule_right = ''
    # Rules must be specified in the correct order: first LEFT, then
    # RIGHT.  A completed pair is stored and the buffers are reset.
    # (Iterate the lines directly instead of indexing by range(len()).)
    for _line in _filedata:
        if _line.startswith('LEFT:'):
            _rule_left = regex.compile(_line[5:], regex.VERBOSE)
        elif _line.startswith('RIGHT:'):
            _rule_right = regex.compile(_line[6:], regex.VERBOSE)
            # NOTE(review): a RIGHT line without a preceding LEFT stores
            # ('', pattern) — presumably valid files never do this.
            self._allowed_regexps.append((_rule_left, _rule_right))
            _rule_left = ''
            _rule_right = ''
        # everything else (comments, blank lines) is ignored
开发者ID:kristiank,项目名称:Lausestaja,代码行数:30,代码来源:ortographicsegmenter.py
示例14: __init__
def __init__(self):
    """Set up the paragraph parser: the inline state machine and the
    compiled inline-markup patterns."""
    # These attributes are set by the parse method
    self.doc = None
    self.para = None
    self.current_string = None
    self.flow = None
    # One state per inline construct; each state's handler consumes its
    # construct.  "END" is the single terminal state.
    self.stateMachine = StateMachine()
    self.stateMachine.add_state("PARA", self._para)
    self.stateMachine.add_state("ESCAPE", self._escape)
    self.stateMachine.add_state("END", None, end_state=1)
    self.stateMachine.add_state("ANNOTATION-START", self._annotation_start)
    self.stateMachine.add_state("CITATION-START", self._citation_start)
    self.stateMachine.add_state("BOLD-START", self._bold_start)
    self.stateMachine.add_state("ITALIC-START", self._italic_start)
    self.stateMachine.add_state("CODE-START", self._code_start)
    self.stateMachine.add_state("QUOTES-START", self._quotes_start)
    self.stateMachine.add_state("INLINE-INSERT", self._inline_insert)
    self.stateMachine.add_state("CHARACTER-ENTITY", self._character_entity)
    self.stateMachine.set_start("PARA")
    # Pre-compiled inline-markup patterns; most use (?<!\\) lookbehinds
    # so that backslash-escaped delimiters don't terminate a span.
    self.patterns = {
        'escape': re.compile(r'\\', re.U),
        'escaped-chars': re.compile(r'[\\\(\{\}\[\]_\*,\.\*`"&]', re.U),
        # {text}(type "specifically" (namespace) ~language) annotation
        'annotation': re.compile(
            r'(?<!\\)\{(?P<text>.*?)(?<!\\)\}(\(\s*(?P<type>\S*?\s*[^\\"\']?)(["\'](?P<specifically>.*?)["\'])??\s*(\((?P<namespace>\w+)\))?\s*(~(?P<language>[\w-]+))?\))?', re.U),
        'bold': re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U),
        'italic': re.compile(r'_(?P<text>((?<=\\)_|[^_])*)(?<!\\)_', re.U),
        'code': re.compile(r'`(?P<text>(``|[^`])*)`', re.U),
        'quotes': re.compile(r'"(?P<text>((?<=\\)"|[^"])*)(?<!\\)"', re.U),
        'inline-insert': re.compile(r'>\((?P<attributes>.*?)\)', re.U),
        'character-entity': re.compile(r'&(\#[0-9]+|#[xX][0-9a-fA-F]+|[\w]+);'),
        # [*id extra] / [#name extra] / [citation] alternatives
        'citation': re.compile(r'(\[\s*\*(?P<id>\S+)(\s+(?P<id_extra>.+?))?\])|(\[\s*\#(?P<name_name>\S+)(\s+(?P<extra>.+?))?\])|(\[\s*(?P<citation>.*?)\])', re.U)
    }
开发者ID:dustinrb,项目名称:sam,代码行数:33,代码来源:samparser.py
示例15: clean_line
def clean_line(line):
    """Normalise one line of Hebrew text and strip reference spans.

    Removes vocalisation (nikkud) and colons, cuts everything after a
    "see in the Tur" cross-reference, and unwraps/removes bracketed and
    parenthesised text.  Returns the cleaned line.  (Python 2 code:
    ``u''``/``ur''`` literals; ``strip_nikkud`` is defined elsewhere.)
    """
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # Search BEFORE the substitutions below: presence of parens/brackets
    # decides the unwrap strategy at the end.
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    # "see in the Tur" style cross-reference; everything after it is cut.
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)  # drop references to Alfasi
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)  # canonical abbreviation
    pos = re.search(reg_ayyen_tur, line)
    if pos:
        line = line[:pos.start()]
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
开发者ID:JonMosenkis,项目名称:Sefaria-Data,代码行数:26,代码来源:basic_ein_parser.py
示例16: add_spaces
def add_spaces(text, exclude=None):
    """Insert spaces at CJK/Latin boundaries and around CJK quotations.

    ``exclude`` is a string of characters next to which no space must
    be added.  (Python 2 code: ``u''`` literals, ``xrange``,
    ``StringIO``; CHAR_*/PATTERN_* and ``_is_cjk`` come from the
    enclosing module.)
    """
    if exclude:
        # rebuild the boundary patterns with the excluded characters
        # subtracted (regex set-difference "--") from the Latin-side
        # character classes
        patt_exclude = regex.escape(exclude)
        patt_eng_cjk = regex.compile(u"([[%s]--%s])([%s])" % (CHAR_ENG_LEFT, patt_exclude, CHAR_CJK))
        patt_cjk_eng = regex.compile(u"([%s])([[%s]--%s])" % (CHAR_CJK, CHAR_ENG_RIGHT, patt_exclude))
    else:
        # pre-compiled module-level defaults
        patt_eng_cjk = PATTERN_ENG_CJK
        patt_cjk_eng = PATTERN_CJK_ENG
    def add_space_func(index1, index2):
        # build a substitution callback joining two groups with a space
        def add_space(match):
            return u"%s %s" % (match.group(index1), match.group(index2))
        return add_space
    text = patt_cjk_eng.subn(add_space_func(1, 2), text)[0]
    text = patt_eng_cjk.subn(add_space_func(1, 2), text)[0]
    if not (exclude and '"' in exclude):
        # XXX"YYY"XXX -> XXX "YYY" XXX
        # where X and Y are CJK characters
        # Walk the text once, tracking whether the next quote opens or
        # closes a quotation, and pad only the CJK-facing side.
        is_left_dquote = True
        is_left_squote = True
        out = StringIO.StringIO()
        for i in xrange(len(text)):
            prev_char = text[i - 1] if i > 0 else None
            cur_char = text[i]
            next_char = text[i + 1] if i < len(text) - 1 else None
            if cur_char == u'"':
                if is_left_dquote:
                    if _is_cjk(prev_char):
                        out.write(u' "')
                    else:
                        out.write(u'"')
                    is_left_dquote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u'" ')
                    else:
                        out.write(u'"')
                    is_left_dquote = True
            elif cur_char == u"'":
                if is_left_squote:
                    if _is_cjk(prev_char):
                        out.write(u" '")
                    else:
                        out.write(u"'")
                    is_left_squote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u"' ")
                    else:
                        out.write(u"'")
                    is_left_squote = True
            else:
                out.write(cur_char)
        text = out.getvalue()
        out.close()
    return text
开发者ID:eliangcs,项目名称:bisheng,代码行数:60,代码来源:spacing.py
示例17: expand_parens
def expand_parens(str, include_spaces=False):
    """Expand an optional parenthesised part into two variants.

    For an entry like "word(s)" this returns the cleaned form without
    the parenthesised text and the cleaned form with it inlined.
    Entries joined with the U+2023 bullet are expanded individually and
    the results concatenated.  Entries with no parenthesised part are
    returned unchanged as a one-element list.
    """
    # A bullet-separated entry is a list of alternatives: expand each.
    if "‣" in str:
        expanded = []
        for alternative in str.split("‣"):
            expanded.extend(expand_parens(alternative))
        return expanded
    if include_spaces:
        pattern_tight = re.compile(r"(^.*)\((.+)\)(.*$)")
        pattern_loose = re.compile(r"(^.*)\((.+)\)(.*$)")
    else:
        # no space allowed between the parens and the attached text
        pattern_tight = re.compile(r"(^.*[^ ])\(([^ ]+)\)(.*$)")
        pattern_loose = re.compile(r"(^.*)\(([^ ]+)\)([^ ].*$)")
    match = pattern_tight.search(str) or pattern_loose.search(str)
    if match is None:
        return [str]
    before, inside, after = match.group(1), match.group(2), match.group(3)
    without = before + after
    within = before + inside + after
    return [clean_str(without), clean_str(within)]
开发者ID:StevenLOL,项目名称:panlex-python-tools,代码行数:29,代码来源:PanlexTools.py
示例18: makeconfig
def makeconfig(infp, outfp, modules, with_ifdef=0):
    # Copy the config.c template from infp to outfp, expanding the two
    # ADDMODULE markers: marker 1 gets "extern void init<mod>();"
    # declarations (optionally #ifdef-guarded), marker 2 gets the
    # inittab entries.  Modules listed in the module-level ``never``
    # set are skipped.  NOTE: uses the legacy Python 1.x ``regex``
    # module, where ``search`` returns a position or -1 — hence the
    # ``>= 0`` tests.  Each marker pattern is set to None once
    # expanded so it fires only once.
    m1 = regex.compile('-- ADDMODULE MARKER 1 --')
    m2 = regex.compile('-- ADDMODULE MARKER 2 --')
    while 1:
        line = infp.readline()
        if not line: break
        outfp.write(line)
        if m1 and m1.search(line) >= 0:
            m1 = None
            for mod in modules:
                if mod in never:
                    continue
                if with_ifdef:
                    outfp.write("#ifndef init%s\n"%mod)
                outfp.write('extern void init%s();\n' % mod)
                if with_ifdef:
                    outfp.write("#endif\n")
        elif m2 and m2.search(line) >= 0:
            m2 = None
            for mod in modules:
                if mod in never:
                    continue
                outfp.write('\t{"%s", init%s},\n' %
                    (mod, mod))
    # a still-set pattern means its marker never appeared in the input
    if m1:
        sys.stderr.write('MARKER 1 never found\n')
    elif m2:
        sys.stderr.write('MARKER 2 never found\n')
开发者ID:Claruarius,项目名称:stblinux-2.6.37,代码行数:28,代码来源:makeconfig.py
示例19: compileRegex
def compileRegex(string, flags):
    """Compile *string* with the ``regex`` module and converted flags.

    If the first compile fails, known-problematic hexadecimal escape
    sequences (HEXADECIMAL_PATTERNS pairs) are rewritten and the
    compile is retried; an error from the retry propagates to the
    caller.
    """
    try:
        return regex.compile(string, convertRegex(flags))
    except Exception:
        # narrowed from a bare ``except:`` so SystemExit and
        # KeyboardInterrupt are no longer swallowed
        for od in HEXADECIMAL_PATTERNS:
            string = string.replace(od[0], od[1])
        return regex.compile(string, convertRegex(flags))
开发者ID:D3f0,项目名称:prymatex,代码行数:7,代码来源:base.py
示例20: __init__
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
    """
    Builds a codec converting between graphemes/code points and integer
    label sequences.
    charset may either be a string, a list or a dict. In the first case
    each code point will be assigned a label, in the second case each
    string in the list will be assigned a label, and in the final case
    each key string will be mapped to the value sequence of integers.
    In the first two cases labels will be assigned automatically.
    As 0 is the blank label in a CTC output layer, output labels and
    input dictionaries are/should be 1-indexed.
    Args:
        charset (unicode, list, dict): Input character set.
    """
    if isinstance(charset, dict):
        self.c2l = charset
    else:
        # automatic labelling: 1-indexed because 0 is the CTC blank
        self.c2l = {grapheme: [label]
                    for label, grapheme in enumerate(sorted(charset), start=1)}
    # map integer labels back to code points; keys are strings because
    # regex only works with strings
    self.l2c = {}  # type: Dict[str, str]
    for grapheme, labels in self.c2l.items():
        self.l2c[''.join(chr(point) for point in labels)] = grapheme

    def _longest_first_alternation(keys):
        # sort prefixes longest-first so longer matches win in the
        # generated alternation
        ordered = sorted(keys, key=len, reverse=True)
        return regex.compile(r'|'.join(regex.escape(k) for k in ordered))

    self.c2l_regex = _longest_first_alternation(self.c2l.keys())
    self.l2c_regex = _longest_first_alternation(self.l2c.keys())
开发者ID:mittagessen,项目名称:kraken,代码行数:30,代码来源:codec.py
注:本文中的regex.compile函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论