本文整理汇总了Python中regex.finditer函数的典型用法代码示例。如果您正苦于以下问题:Python finditer函数的具体用法?Python finditer怎么用?Python finditer使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了finditer函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: _match_by_edit_distance
def _match_by_edit_distance(full_text, text_to_match):
    """Find the substring of ``full_text`` that best matches ``text_to_match``.

    Candidate substrings are gathered by locating occurrences of the first
    word (or first character, as a fallback) of ``text_to_match`` inside
    ``full_text``; the candidate with the lowest edit distance wins, and its
    trailing edge is then adjusted so it ends on the same character as
    ``text_to_match``.  Returns the matched substring, or ``text_to_match``
    itself when nothing at all could be located.

    NOTE(review): ``edit_distance`` is an external helper (presumably
    nltk.edit_distance or similar) — confirm against the enclosing module.
    """
    # Undo PTB-style bracket escaping so the snippet matches the raw text.
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        # Search on the first word only (up to the first space); candidates
        # are slices one character longer than the target, to leave room for
        # the trailing-edge fix-up below.
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                             re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    # NOTE(review): bare except that prints and exits the process — this
    # swallows the real exception type; consider narrowing and re-raising.
    except:
        import sys
        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
    if len(potential_matches) == 0:
        # Fallback 1: anchor on just the first character.
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        # Fallback 2: "(" may have been transcribed as "[" — retry after
        # substituting, then trim each candidate back to the last occurrence
        # of the (new) final character.
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
        potential_matches = [(p[0:p.rindex(text_to_match[-1])+1]
                              if text_to_match[-1] in p and len(p) > len(text_to_match)
                              else p)
                             for p in potential_matches]
    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    # Pick the candidate with the lowest edit distance; <= means a later
    # candidate wins ties.
    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    # Trailing-edge fix-up: shrink (or grow from full_text) until the result
    # ends on the same character as text_to_match.
    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        # Accept straight or curly closing quotes interchangeably.
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        # Extend one character at a time from full_text until the endings line up.
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]
    return result
开发者ID:EducationalTestingService,项目名称:match,代码行数:54,代码来源:Match.py
示例2: _sustituirReg
def _sustituirReg(self, aux, regOrig, regDest):
    """Rewrite *aux* according to the rule regOrig -> regDest.

    The third-party ``regex`` module is required because the pattern is
    scanned in both directions: a normal forward pass first, then a
    reversed pass via the ``(?r)`` prefix.  The first overlapped match
    whose named groups pass ``parentesisCoherentes1`` (balanced
    parentheses) is formatted into *regDest* and returned with a trailing
    newline; if no direction yields a coherent match, *aux* is returned
    unchanged.
    """
    # Forward scan first, then reversed ("(?r)") scan.
    for direction in ("^", "(?r)^"):
        anchored = direction + regOrig + "$"
        for m in regex.finditer(anchored, aux, overlapped=True):
            groups = m.groupdict()
            if self.parentesisCoherentes1(groups):
                return regDest.format(**agregaCTX(groups)) + "\n"
    return aux
开发者ID:yeboyebo,项目名称:AQNext,代码行数:13,代码来源:Flujo.py
示例3: search_strand
def search_strand(pattern, sequence_to_scan, strand=1):
    '''
    take a sequence pattern (element) and find occurrences of that on the
    provided, larger 5'-->3' sequence.
    Assumes strand is first unless provided.
    Tracks the start and end points of each occurrence, returning a list of
    that information where each element is a tuple of the start and end points
    along with the strand.
    Works with overlapped sequences because now
    "regex.findall and regex.finditer support an ‘overlapped’ flag which
    permits overlapped matches."
    , see https://pypi.python.org/pypi/regex/2018.02.21
    based on https://www.biostars.org/p/209383/ (specifically steve's answer)
    '''
    occurrences = []
    # Case-fold both pattern and sequence; scan with overlaps allowed.
    for hit in regex.finditer(
            pattern.upper(), str(sequence_to_scan.upper()), overlapped=True):
        if strand == 1:
            # 1-based coordinates on the given strand.
            coords = (hit.start() + 1, hit.end() + 1, strand)
        else:
            # Mirror the coordinates for the complementary strand.
            coords = ((len(sequence_to_scan) - hit.start()) + 1,
                      (len(sequence_to_scan) - hit.end()) + 1,
                      strand)
        occurrences.append(coords)
    return occurrences
开发者ID:fomightez,项目名称:sequencework,代码行数:29,代码来源:find_sequence_element_occurrences_in_sequence.py
示例4: plx_wrapper
def plx_wrapper(text):
    """Apply the module-level ``proc`` transform to each PanLex segment of *text*.

    *text* is split on the PanLex delimiters ``⫷ex…⫸`` / ``⫷df…⫸``; each
    segment is processed individually and the surviving segments are
    re-joined.  When no delimiters are present at all, the whole string is
    handed to ``process_synonyms(proc)`` instead.

    NOTE(review): ``proc``, ``delimToPanlex``, ``process_synonyms``,
    ``get_plx_fields`` and ``filter_unique_meanings`` are defined elsewhere
    in this module — their exact contracts are assumed here.
    """
    before = text  # NOTE(review): unused — kept, presumably for debugging
    text = delimToPanlex(text)
    # Start offsets of every ⫷ex⫸ / ⫷df⫸ delimiter (optionally with a
    # ":lang-nnn" qualifier).
    idx_list = [ex_match.start() for ex_match in re.finditer('⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸', text)]
    if len(idx_list) == 0:
        # No tagged segments: treat the whole string as one synonym list.
        return process_synonyms(proc)(text)
    idx_list.append( len(text))
    # If there is untagged leading text, treat it as a segment starting at 0.
    if len(text[ 0:idx_list[0] ].strip()) > 0:
        idx_list.insert(0,0)
    final_exp = []
    for idx in range(len(idx_list) - 1):
        # One delimited segment: [tag][text][attributes].
        ex = text[ idx_list[idx] : idx_list[idx+1]]
        tag,ex_text,attributes = get_plx_fields(ex)
        result = proc(ex_text)
        result_match = re.search('(⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸)(.+)', result)
        if result_match:
            # proc returned a tagged result: keep it (with attributes) if non-empty.
            # NOTE(review): group 1 is the tag itself, so this length check is
            # always true when the regex matched — possibly group 2 (the
            # content) was intended; confirm against the project history.
            if len(result_match[1].strip()) > 0:
                final_exp.append('%s%s' % (result,attributes))
        else:
            # Untagged result: re-attach the original tag and attributes.
            if len(result.strip()) > 0:
                final_exp.append('%s%s%s' % (tag,result,attributes))
    final_exp = filter_unique_meanings(final_exp)
    text = ''.join(final_exp)
    return text
开发者ID:longnow,项目名称:panlex-tools,代码行数:29,代码来源:source.py
示例5: __init__
def __init__(self, names, features=None, ftstr='', weights=None):
    """Construct a `Segment` object.

    Args:
        names (list): ordered list of feature names
        features (dict): name-value pairs for specified features; any
            feature not listed defaults to 0
        ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
            interpreted as a feature specification
        weights (list): ordered list of feature weights/saliences; defaults
            to 1 for every feature
    """
    # Fix: the original used a mutable default argument (features={}),
    # which is shared across all calls — build a fresh dict per call.
    if features is None:
        features = {}
    self.n2s = {-1: '-', 0: '0', 1: '+'}
    self.s2n = {k: v for (v, k) in self.n2s.items()}
    self.names = names
    """Set a feature specification"""
    # Every named feature starts at 0 unless overridden by `features`.
    self.data = {}
    for name in names:
        if name in features:
            self.data[name] = features[name]
        else:
            self.data[name] = 0
    # Specifications embedded in `ftstr` (e.g. "+voi0cont") are applied on
    # top of the values taken from `features`.
    for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
        v, k = m.groups()
        self.data[k] = self.s2n[v]
    if weights:
        self.weights = weights
    else:
        self.weights = [1 for _ in names]
开发者ID:dmort27,项目名称:panphon,代码行数:27,代码来源:segment.py
示例6: tokenize
def tokenize(self, value):
    """
    Perform the tokenizing.

    Required Argument
    value -- The unicode string to tokenize.

    Yields the (reused) Token instance updated for each token: either the
    regexp matches themselves, or — when gaps=True — the text between them.
    """
    t = Token()  # The token instance we will reuse
    if not self._gaps:
        # The default: expression matches are used as tokens
        for pos, match in enumerate(self._regexp.finditer(value)):
            yield t.update(match.group(0), index=(match.start(), match.end(),), position=pos)
    else:
        # When gaps=True, iterate through the matches and
        # yield the text between them.
        left = 0
        last_pos = 0
        for pos, match in enumerate(regex.finditer(self._regexp, value)):
            # gap_end = where the current gap stops (match start);
            # next_left = where the next gap starts (match end).
            # Fix: renamed from `right, next` — `next` shadowed the builtin.
            gap_end, next_left = match.span()
            if gap_end != 0:
                yield t.update(value[left:gap_end], position=pos, index=(left, gap_end,))
            left = next_left
            last_pos = pos
        # Trailing text after the final match is its own token.
        if left != len(value):
            yield t.update(value[left:], position=last_pos+1, index=(left, len(value),))
开发者ID:Kapiche,项目名称:caterpillar,代码行数:26,代码来源:tokenize.py
示例7: determine_match
def determine_match(commentary_name, commentary_regex):
    """Cross-check commentary citations against Shulchan Arukh, Even HaEzer.

    For every seif, find citation markers matching ``commentary_regex`` and
    fuzzy-compare the quoted opening word (dibur hamatchil) against the first
    word of the cited commentary text; mismatches (ratio < 75) are printed
    and tallied.  Returns a Counter keyed by (dh_text, c_text) pairs.

    NOTE(review): Python 2 code (print statements, `unicode`).  The 'ref'
    group consumed below must be defined inside ``commentary_regex`` by the
    caller; the '(?>…)' atomic group requires the third-party regex module.
    """
    issues = 0
    # Append a dh capture after the caller's pattern, skipping markup tokens.
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex)
    full_mechaber = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()
    for siman_num, siman in enumerate(full_mechaber.get_simanim()):
        for seif_num, seif in enumerate(siman.get_child()):
            matches = regex.finditer(full_pattern, unicode(seif))
            for regex_match in matches:
                # Build the commentary Ref from the gematria of the cited seif.
                c_ref = Ref(u'{} {}:{}'.format(commentary_name, siman_num+1, getGematria(regex_match.group('ref'))))
                try:
                    c_text = c_ref.text('he').text.split()[0]
                except IndexError:
                    # Cited segment has no text; nothing to compare.
                    continue
                # Strip everything but Hebrew letters before comparing.
                c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text)
                dh_text = re.sub(u'[^\u05d0-\u05ea]', u'',regex_match.group('dh'))
                ratio = fuzz.ratio(dh_text, c_text)
                if ratio < 75.0:
                    issues += 1
                    print u"Potential mismatch:"
                    print u"Shulchan Arukh, Even HaEzer {}:{} {}".format(siman_num+1, seif_num+1, dh_text)
                    print u"{} {}".format(c_ref.normal(), c_text)
                    print u"Score: {}".format(ratio)
                    error_counter[(dh_text, c_text)] += 1
    print u"Total issues: {}".format(issues)
    return error_counter
开发者ID:JonMosenkis,项目名称:Sefaria-Data,代码行数:30,代码来源:validate_existing_commentaries.py
示例8: tokenize
def tokenize(text):
    """
    Yield tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token.
    """
    # Norwegian stemmer; \p{L}+ (regex module) matches any run of letters.
    stemmer = SnowballStemmer('norwegian').stem
    for position, word_match in enumerate(regex.finditer('\p{L}+', text.lower())):
        raw = word_match.group(0)
        # Emit the token.
        yield {
            'stemmed': stemmer(raw),
            'unstemmed': raw,
            'offset': position
        }
开发者ID:kmelve,项目名称:textplot,代码行数:26,代码来源:utils.py
示例9: _match
def _match(self, text):
    """Run every configured pattern over *text* with overlapped matching.

    Returns a list of dicts, one per match, each carrying 'start', 'end',
    'regex' (the pattern that fired) and 'groups' (the match's groupdict).
    When ``self.mapping`` is set, patterns come from ``self.map`` and the
    mapped metadata is merged in without overwriting match fields.
    """
    matches = []
    # Patterns: keys of self.map in mapping mode, else the plain sequence.
    if self.mapping:
        seq = self.map.keys()
    else:
        seq = self.regex_sequence
    for r in seq:
        for matchobj in re.finditer(r, text, overlapped=True):
            result = {
                'start': matchobj.start(),
                'end': matchobj.end(),
                'regex': r,
                'groups': matchobj.groupdict()
            }
            if self.mapping:
                for k, v in self.map[r].items():
                    # Fix idiom: membership test directly on the dict
                    # (was `k not in result.keys()`).
                    if k not in result:
                        result[k] = v
            matches.append(result)
    return matches
开发者ID:estnltk,项目名称:estnltk,代码行数:27,代码来源:event_tagger.py
示例10: parse_ind_vars
def parse_ind_vars(self):
    """Define values of independent variables by parsing first example of form:
    [var name] [value 0]
    [value 1]
    ...
    [value n]
    [blank]
    *or*
    [text]
    :return:
    """
    # Fix: removed the unused `selem` structuring-element array left over
    # from an earlier approach.
    # some weird bug with np.pad and string dtype, so pad manually: allocate
    # a zero ('') array one row/column larger and copy the sheet in.
    s_type = 'S%d' % (max([len(x) for y in self.sheet for x in y]) + 10)
    # NOTE(review): 'S' is a bytes dtype under Python 3; the `!= ''`
    # comparison below assumes Python 2 str semantics — confirm.
    xs_values = np.zeros(np.array(self.sheet.shape) + 1, dtype=s_type)
    xs_values[:-1, :-1] = self.sheet
    # Encode emptiness of the first two columns per row as one letter:
    # A = both empty, B = only col 0 filled, C = only col 1 filled, D = both.
    mask = (xs_values[:, :2] != '').astype(int)
    mask[:, 1] *= 2
    mask_string = ''.join(['ABCD'[i] for i in mask.sum(axis=1)])
    # A variable block is a 'D' row (name + first value) followed by 'C'
    # rows (additional values), terminated by any non-'C' row.
    ind_vars = {}
    for x in re.finditer('(DC+)[ABD]', mask_string):
        name = xs_values[x.span()[0], 0]
        values = xs_values[x.span()[0]:x.span()[1] - 1, 1]
        ind_vars[name] = list(values)
    self.ind_vars.update(ind_vars)
开发者ID:feldman4,项目名称:lasagna,代码行数:29,代码来源:conditions_.py
示例11: parse
def parse(self, data, regex = None, encoding = "utf-8"):
    """Split *data* into a flat list of nodes using a master regex.

    Each match is dispatched to a ``parse_<lastgroup>`` method on *self*
    (the name of the last matched group selects the handler), which builds
    a node from the match's groupdict.  Plain text between matches is kept
    as strings with carriage returns dropped and newlines collapsed to
    spaces.  Returns the list of nodes (strings and handler results).

    NOTE(review): the `regex` parameter shadows the regex module name in
    this scope; it is expected to be a compiled pattern (defaults to
    ``self.master``).
    """
    regex = regex or self.master
    # Decode byte input up front so all slicing below is on text.
    is_unicode = appier.legacy.is_unicode(data)
    if not is_unicode: data = data.decode(encoding)
    nodes = []
    matches = regex.finditer(data)
    current = 0
    for match in matches:
        name = match.lastgroup
        parts = match.groupdict()
        start, end = match.span()
        if start > current:
            # Literal text between the previous match and this one.
            value = data[current:start]
            value = value.replace("\r", "")
            value = value.replace("\n", " ")
            if value: nodes.append(value)
        # Dispatch to the handler named after the matched group.
        method = getattr(self, "parse_" + name)
        node = method(parts)
        nodes.append(node)
        current = end
    # Trailing literal text after the final match.
    remaining = data[current:]
    remaining = remaining.replace("\r", "")
    remaining = remaining.replace("\n", " ")
    if remaining: nodes.append(remaining)
    return nodes
开发者ID:hivesolutions,项目名称:appier_extras,代码行数:34,代码来源:markdown.py
示例12: findall_p_in_s
def findall_p_in_s(p, s):
    """Return a list of Match objects for every occurrence of pattern *p* in *s*.

    Fixes over the original: (1) single pass — the original scanned *s*
    twice, once with findall and once with finditer; (2) when *p* contains
    capture groups, ``regex.findall`` returns the group texts rather than
    the full match, so the matched text was mispaired with the full-match
    spans — ``m.group(0)`` always gives the text for the span; (3) repaired
    the malformed quadruple-quoted docstring.
    """
    return [Match(m.group(0), m.start(), m.end()) for m in regex.finditer(p, s)]
开发者ID:apathinwalking,项目名称:tidyaddr,代码行数:7,代码来源:match.py
示例13: _find
def _find(self, *args):
    """Print every line of the shared index matching args[0].

    Index lines have the shape "file:lineno:content"; the search is
    case-insensitive, multi-line, and runs concurrently via the regex
    module's V1 engine.

    NOTE(review): Python 2 syntax (`except ..., e`).  A malformed pattern
    just prints a blank line; `e` is never used — consider reporting it.
    """
    global index
    try:
        # args[0] is interpolated raw into the pattern, so regex
        # metacharacters in the query act as such (intentional, apparently).
        for match in regex.finditer(u'^.+?:\d+?:.*%s.*$' % args[0], index.data, regex.MULTILINE | regex.IGNORECASE | regex.V1, concurrent=True):
            self._print(match.group(0))
    except sre_constants.error, e:
        print
开发者ID:taky,项目名称:textile,代码行数:7,代码来源:textile.py
示例14: prune_by_precision
def prune_by_precision(self, min_precision, text_data_pairs):
    """
    Removes patterns from the model that don't reach a minimum precision.

    Each pattern's predictions (its match spans over each text) are scored
    against the reference annotations of the pattern's predicted type, and
    any pattern whose aggregate precision falls below the threshold is
    deleted from ``self.regex_type_attributes_map``.

    :param float min_precision: the minimum precision required of a pattern when applied to the given data
    :param collections.Iterable text_data_pairs: an iterable of `(text, data)` pairs where `text` is a string and
        `data` is an anafora.AnaforaData object
    """
    # Aggregate an anafora Scores object per pattern across all texts.
    pattern_scores = collections.defaultdict(lambda: anafora.evaluate.Scores())
    for text, data in text_data_pairs:
        # collect the spans of each type of reference annotation
        reference_type_spans_map = collections.defaultdict(lambda: set())
        for annotation in data.annotations:
            reference_type_spans_map[annotation.type].add(annotation.spans)
        # make predictions with each pattern in the model
        for pattern in self.regex_type_attributes_map:
            # Spans are wrapped in an extra tuple to mirror the shape of
            # annotation.spans (a tuple of (start, end) pairs).
            predicted_spans = {((m.start(), m.end()),) for m in regex.finditer(pattern, text)}
            if predicted_spans:
                predicted_type, _ = self.regex_type_attributes_map[pattern]
                # update the scores for this pattern
                pattern_scores[pattern].add(reference_type_spans_map[predicted_type], predicted_spans)
    # delete any pattern with a precision lower than the minimum requested
    # (safe: we iterate pattern_scores while deleting from the other dict)
    for pattern, scores in pattern_scores.items():
        if scores.precision() < min_precision:
            del self.regex_type_attributes_map[pattern]
开发者ID:bethard,项目名称:anaforatools,代码行数:29,代码来源:regex.py
示例15: process_verses
def process_verses(chap_string, expression):
    """
    Take an entire chapter as a string and break up into verses. The new chapter index (number followed by
    a space) must be stripped out.
    :param chap_string: All verses in a chapter combined as one string.
    :param expression: A compiled regular expression with which to find new verses.
    :return: A list of strings (jagged array), with each verse as a separate string.
    """
    # find all new verses with the regular expression
    matches = expression.finditer(chap_string)
    # save start position of first verse and initiate list of verses
    try:
        start = next(matches)
    except StopIteration:
        # No verse markers at all: the whole chapter is one verse.
        return [chap_string]
    verses = []
    # loop through matches until StopIteration is raised at the last verse
    while True:
        try:
            end = next(matches)
            # Each verse runs from just before this marker's end to the
            # start of the next marker.
            verses.append(chap_string[start.end()-1:end.start()])
            start = end
        except StopIteration:
            verses.append(chap_string[start.end()-1:])
            break
    # error correction - look for numbers in each verse and compare to verse number
    # This will differentiate between incorrectly formatted verses numbers and other numbers in the text.
    corrected_verses = []
    for index, verse in enumerate(verses):
        nums = re.finditer(u'\d{1,3} ', verse)
        good = True
        for num in nums:
            # A number exactly two ahead of the 0-based index is the next
            # verse's marker that failed to split — split it out manually.
            if int(num.group()) - index == 2:
                # add first verse
                corrected_verses.append(verse[:num.start()])
                # edit second verse
                second = verse[num.start():]
                # NOTE(review): num.group()[:len(num.group())] is the whole
                # string, so this replace is a no-op — probably meant to
                # trim the trailing space; confirm intent before changing.
                second = second.replace(num.group(), num.group()[:len(num.group())])
                corrected_verses.append(second)
                good = False
                break
        if good:
            corrected_verses.append(verse)
    # strip out the * marker used to help differentiate numbers and verses
    for index, verse in enumerate(corrected_verses):
        corrected_verses[index] = verse.replace(u'*', u'')
    return corrected_verses
开发者ID:JonMosenkis,项目名称:Sefaria-Data,代码行数:59,代码来源:JPS1985.py
示例16: finditer
def finditer(
    pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
    partial=False, concurrent=None, **kwargs
):
    """Drop-in wrapper around ``regex.finditer``.

    Identical signature and behavior, except that search-side
    back-references in *pattern* are expanded first via
    ``_apply_search_backrefs`` (defined elsewhere in this module) before
    the pattern is handed to the regex engine.
    """
    return regex.finditer(
        _apply_search_backrefs(pattern, flags), string,
        flags, pos, endpos, overlapped, partial, concurrent, **kwargs
    )
开发者ID:lupescu,项目名称:subtxt_pkg,代码行数:8,代码来源:bregex.py
示例17: find_mofit
def find_mofit(s, t):
    """
    :return: All locations of t as a substring of s.
    :param: s, t Two DNA strings
    """
    # Scan with overlaps allowed; report 1-based start positions,
    # space-separated.
    positions = []
    for hit in finditer(t, s, overlapped=True):
        positions.append(str(hit.start() + 1))
    return " ".join(positions)
开发者ID:jd901215,项目名称:Rosalind,代码行数:8,代码来源:find_mofit.py
示例18: MultipleApproxPatternMatch
def MultipleApproxPatternMatch(inputset):
    """Return start offsets of approximate occurrences of several patterns.

    :param inputset: a ``(patterns, d)`` pair — an iterable of pattern
        strings and a mismatch budget ``d``.
    :return: list of 0-based start indices (with overlaps), in pattern order.

    NOTE(review): reads the module-level global ``text`` as the string to
    search; ``GetRE`` presumably builds a regex allowing up to ``d``
    mismatches — confirm against its definition.
    """
    patterns, d = inputset
    all_matched_index = []
    for eachp in patterns:
        reg = GetRE(eachp, d)
        for m in regex.finditer(reg, text, overlapped = True):
            all_matched_index.append(m.start())
    return all_matched_index
开发者ID:ajing,项目名称:BioinfoAlgorithm,代码行数:8,代码来源:MultipleApproxPatternMatch.py
示例19: glycocheck
def glycocheck(protein):
    """Return the 1-based start positions of every (possibly overlapping)
    N-glycosylation motif N{P}[ST]{P} in *protein*."""
    import regex as re
    # overlapped=True so motifs sharing residues are all reported.
    return [hit.start() + 1
            for hit in re.finditer("N[^P][ST][^P]", protein, overlapped=True)]
开发者ID:eignatenkov,项目名称:rosalind,代码行数:8,代码来源:mprt.py
示例20: get_word_groups_from_line
def get_word_groups_from_line(self, song_line):
    """Split *song_line* into Group objects, one per whitespace-delimited
    run of characters containing no digits, hyphens, or semicolons."""
    # Earlier, stricter pattern kept for reference:
    #   r"(?:^|\s)([A-Za-z,'! &?.]+)(?:\s|$)"
    pattern = r"(?:^|\s)([^-0-9;]+)(?:\s|$)"
    return [Group(m.start(), m.group(0).strip())
            for m in re.finditer(pattern, song_line)]
开发者ID:JonAWhite,项目名称:choir_tracker,代码行数:9,代码来源:song_list_extractor.py
注:本文中的regex.finditer函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论