本文整理汇总了Python中regex.findall函数的典型用法代码示例。如果您正苦于以下问题:Python findall函数的具体用法?Python findall怎么用?Python findall使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了findall函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: serial_killer_guess
def serial_killer_guess(self):
    """
    Implements the Aiden-Michel serial-killer algorithm as described at
    http://dx.doi.org/10.1126/science.1199644
    I don't think this is likely to be that useful for most users;
    it's here to test the algorithm.

    Classifies this record as "serial" when any word of the title or of the
    first author's name hits the blacklists below, otherwise as "book".
    """
    # raw strings for the regexes; the originals relied on "\w" surviving
    # as an unrecognised escape, which warns on modern Python
    titles = set(re.findall(r"\w+", self.title().lower()))
    try:
        author = set(re.findall(r"\w+", self.first_author()["first_author_name"].lower()))
    except KeyError:
        # record carries no first-author field
        author = set([])
    # words typical of periodicals/serials (the original listed "digest" twice)
    title_blacklist = set(["advances", "almanac", "annual", "bibliography", "biennial", "bulletin", "catalog", "catalogue", "census", "conference", "conferences", "congress", "congressional", "digest", "directory", "hearings", "index", "journal", "magazine", "meeting", "meetings", "monthly", "papers", "periodical", "proceedings", "progress", "quarterly", "report", "reports", "review", "revista", "serial", "society", "subcommittee", "symposium", "transactions", "volume", "yearbook", "yearly"])
    # corporate/institutional author words that suggest a serial publisher
    author_blacklist = set(["the", "of", "and", "administration", "congress", "international", "national", "federal", "state", "american", "british", "consortium", "university", "office", "america", "united", "states", "britain", "ireland", "canada", "australia", "institute", "research", "committee", "subcommittee", "court", "association", "foundation", "board", "bureau", "house", "senate", "dept", "department", "state", "council", "club", "school", "network", "online", "company", "co", "us", "u.s.", "survey", "agency", "academy", "commission", "press", "publishing", "publishers", "academic", "cambridge", "sciencedirect", "kluwer", "oxford", "interscience", "library", "on", "society", "service", "affairs", "division", "commerce", "public", "foreign", "government", "agriculture", "science", "engineers", "stanford", "medical", "energy", "laboratory", "economic", "geological", "assembly", "alabama", "alaska", "american", "arizona", "arkansas", "california", "colorado", "connecticut", "delaware", "columbia", "district", "florida", "georgia", "guam", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri", "montana", "nebraska", "nevada", "hampshire", "jersey", "mexico", "york", "ohio", "oklahoma", "oregon", "pennsylvania", "north", "south", "tennessee", "texas", "utah", "vermont", "wisconsin", "wyoming"])
    # any hit in either blacklist => serial
    if len(titles.intersection(title_blacklist)) + len(author.intersection(author_blacklist)):
        return "serial"
    return "book"
开发者ID:Bookworm-project,项目名称:Bookworm-MARC,代码行数:25,代码来源:bookwormMARC.py
示例2: countRepeats
def countRepeats(read, readReverse):
    """Count (overlapping) occurrences of *read* in the global forward
    reference, plus occurrences in the global reverse reference unless the
    read is its own reverse.  Requires the third-party `regex` module for
    ``overlapped=True``."""
    global reference
    global referenceReverse
    forward_hits = len(re.findall(str(read), str(reference), overlapped=True))
    # palindromic reads would be double-counted on the reverse strand
    reverse_hits = (len(re.findall(str(read), str(referenceReverse), overlapped=True))
                    if read != readReverse else 0)
    return forward_hits + reverse_hits
开发者ID:qiuxan,项目名称:mystudy,代码行数:9,代码来源:count.py
示例3: pep_end
def pep_end(pep,seq):
    """Locate where pattern *pep* ends inside *seq*.

    Returns a list of end offsets when there are several matches, a bare
    int when there is exactly one, and the string 'Not found' otherwise
    (return types preserved from the original for caller compatibility).
    """
    # single finditer pass replaces the original's two redundant findall
    # scans plus an extra search for the single-match case
    ends = [m.end() for m in finditer(pep, seq)]
    if len(ends) > 1:
        return ends
    if len(ends) == 1:
        return ends[0]
    return 'Not found'
开发者ID:marcottelab,项目名称:Nico,代码行数:10,代码来源:Hs_addseq.py
示例4: pep_end
def pep_end(row):
    """Locate where the peptide of *row* ends inside its sequence.

    *row* maps 'Peptide' and 'Sequence' to strings.  I and L are folded
    together (mass-spec cannot distinguish them), so both are matched via
    the alternation (I|L).  Returns a list of end offsets (several matches),
    a one-element list (single match) or 'Not found'.
    """
    # fold leucine/isoleucine ambiguity into a regex alternation
    pep = row['Peptide'].replace('I','J').replace('L','J').replace('J','(I|L)')
    seq = row['Sequence']
    # single finditer pass replaces the original's repeated findall scans
    ends = [m.end() for m in finditer(pep, seq)]
    if not ends:
        return 'Not found'
    return ends if len(ends) > 1 else [ends[0]]
开发者ID:marcottelab,项目名称:Nico,代码行数:12,代码来源:At_addseq.py
示例5: repl
def repl(match):
    """re.sub callback: expand a '{% style ... %}'-style directive found in
    an ODT XML document into an XPath predicate over text:style-name.

    NOTE(review): free variables `xml_string` (the whole document) and
    `classes_used` (styles already consumed) come from an enclosing scope --
    this is a closure, presumably handed to regex.sub; confirm at the call
    site.  Python 2 code (print statements, ur'' literals).  `unknown` is
    set but not used in the visible portion -- likely consumed further down
    in the original (truncated) function.
    """
    # e.g. match.group(0) = {% class super %}
    unknown = True
    directive = match.group(1)
    ret = match.group(0)
    # tokenize the directive, dropping empty fragments
    parts = [p.strip() for p in directive.split(' ') if p.strip()]
    if parts:
        if parts[0] == 'style':
            unknown = False
            classes_intersection = None
            for term in parts[1:]:
                classes = []
                if term == 'STRUCKTWICE':
                    # marker words whose surrounding span carries the
                    # struck-twice style, e.g.
                    # <span class="T33">aiulfus</span>
                    classes = regex.findall(ur'<text:span text:style-name="([^"]+)">(?:aiulfus|7 dim)</text:span>', xml_string)
                    classes = list(set(classes))
                elif term == 'PRO':
                    # style of the 'p' span in an expanded 'p[ro]' abbreviation
                    classes = regex.findall(ur'\s<text:span text:style-name="([^"]+)">p</text:span>\[ro\]\s', xml_string)
                    classes = list(set(classes))
                else:
                    # find the class with the term <term> (e.g. super)
                    # <style:style style:name="T3" style:family="text">
                    #     <style:text-properties
                    #         style:text-position="super 58%" />
                    # </style:style>
                    for style in regex.findall(ur'(?musi)<style:style style:name="(.*?)"[^>]*>(.*?)</style:style>', xml_string):
                        if term in style[1]:
                            classes.append(style[0])
                    if not classes:
                        raise Exception('ERROR: style not found "%s"' % term)
                if classes_intersection is None:
                    classes_intersection = classes
                else:
                    # only keep the classes/styles that meet all keywords (AND)
                    classes_intersection = set(classes).intersection(set(classes_intersection))
            # now remove classes which we have already used
            already_used_warning = set(classes_intersection).intersection(set(classes_used.keys()))
            if already_used_warning:
                print '<!-- Already used classes/styles: %s (see above) -->' % ', '.join(list(already_used_warning))
            classes_intersection = set(classes_intersection).difference(set(classes_used.keys()))
            # update the classes_used
            for cls in classes_intersection:
                classes_used[cls] = 1
            # emit an XPath OR-predicate over the surviving styles
            ret = ' or '.join([ur"@text:style-name='%s'" % cls for cls in classes_intersection])
            print '<!-- %s => %s -->' % (parts, ret)
开发者ID:MCadeStewart,项目名称:digipal,代码行数:51,代码来源:dpxml.py
示例6: pep_start
def pep_start(row):
    """Locate where the peptide of *row* starts inside its sequence.

    *row* maps 'Peptide' and 'Sequence' to strings.  Positions are 1-based.
    Returns a list of start positions (one element when there is a single
    match) or 'Not found'.
    """
    # correct leucine-isoleucine insensibility: I and L are indistinguishable
    pep = row['Peptide'].replace('I','J').replace('L','J').replace('J','(I|L)')
    seq = row['Sequence']
    # single finditer pass replaces the repeated findall scans
    starts = [m.start() + 1 for m in finditer(pep, seq)]
    if not starts:
        return 'Not found'
    # Bug fix: the original returned the *0-based* search().start() for a
    # single match, inconsistent with the 1-based multi-match list (and with
    # the companion pep_end, which uses .end() in both branches).
    return starts if len(starts) > 1 else [starts[0]]
开发者ID:marcottelab,项目名称:Nico,代码行数:14,代码来源:At_addseq.py
示例7: get_output_extension
def get_output_extension(file):
    """Return the output extension of *file* (lower-cased, leading dot).

    For a compound name like 'app.js.coffee' the *first* extension ('.js')
    is the output extension; for a single extension it is returned as-is.
    Returns '' when the name carries no extension at all.
    """
    lowered = str(file).lower()
    trailing = re.findall(r'(\..+)+$', lowered)
    if not trailing:
        return ''
    # split the trailing '.a.b.c' run into individual '.ext' pieces
    extensions = re.findall(r'\.[^.]*', trailing[-1])
    return extensions[-2] if len(extensions) > 1 else extensions[-1]
开发者ID:OiNutter,项目名称:rivets,代码行数:14,代码来源:extensions.py
示例8: getCoverageProblems
def getCoverageProblems(self):
    """Verify that each rule and each exclusion has the right number of tests
    that applies to it. TODO: Also check that each target has the right
    number of tests. In particular left-wildcard targets should have at least
    three tests. Right-wildcard targets should have at least ten tests.
    Returns an array of strings reporting any coverage problems if they exist,
    or empty list if coverage is sufficient.
    """

    def _required_tests(pattern):
        """Tests a pattern needs: one, plus one per quantifier/alternation,
        minus the '?' characters that introduce no branching."""
        count = 1 + len(regex.findall("[+*?|]", pattern))
        # The '?' of non-capturing and lookahead groups is not a quantifier.
        count -= len(regex.findall(r"\(\?:", pattern))
        count -= len(regex.findall(r"\(\?!", pattern))
        count -= len(regex.findall(r"\(\?=", pattern))
        # Escaped question marks match a literal '?' and need no extra test.
        # (Bug fix: the original pattern "\\?" matched *every* '?', escaped
        # or not, silently cancelling the quantifier count above.)
        count -= len(regex.findall(r"\\\?", pattern))
        return count

    # assumed to return a (possibly empty) list of problem strings that we
    # append to below -- TODO confirm against _determineTestApplication()
    problems = self._determineTestApplication()
    # Next, make sure each rule or exclusion has sufficient tests.
    for rule in self.rules:
        needed_count = _required_tests(rule.fromPattern)
        actual_count = len(rule.tests)
        if actual_count < needed_count:
            problems.append("%s: Not enough tests (%d vs %s) for %s" % (
                self.filename, actual_count, needed_count, rule))
    for exclusion in self.exclusions:
        # Bug fix: the original subtracted the "\\?" count of
        # rule.fromPattern (the stale variable from the previous loop)
        # instead of the exclusion's own pattern, and skipped the
        # lookahead adjustments entirely.
        needed_count = _required_tests(exclusion.exclusionPattern)
        actual_count = len(exclusion.tests)
        if actual_count < needed_count:
            problems.append("%s: Not enough tests (%d vs %s) for %s" % (
                self.filename, actual_count, needed_count, exclusion))
    return problems
开发者ID:KurtKrampmeier,项目名称:https-everywhere,代码行数:35,代码来源:rules.py
示例9: getWordMatrix
def getWordMatrix(word, model, padToMaxLength=None):
    """Turn *word* (a phoneme-encoded string) into a list of embedding
    vectors looked up in *model* (a phoneme -> vector mapping).

    When *padToMaxLength* is given, the result is returned as a numpy array
    zero-padded with extra rows up to that length; otherwise a plain list
    of the model's vectors is returned.
    """
    phonemes_alone = "pbmfv84tdszcnSZCjT5kgxNqGX7hlLwyr!ieaouE3"
    # A phoneme is: a single symbol (optionally suffixed with " or *) that
    # does not open a ~-digraph or $-trigraph, OR a two-symbol ~ digraph,
    # OR a three-symbol $ trigraph.  Raw strings are used for the escaped
    # metacharacters (the original's non-raw "\$"/"\*" trigger warnings on
    # modern Python but denote the same pattern).
    phonemeSearchRegex = ("[" + phonemes_alone + "]" + r'["\*]?' +
                          "(?![" + phonemes_alone + "]~|[" + phonemes_alone + r"]{2}\$)" +
                          "|[" + phonemes_alone + "]{2}?~" +
                          "|[" + phonemes_alone + r"]{3}?\$")
    phonemes = regex.findall(phonemeSearchRegex, word)
    # hoisted fallback splitter (the original rebuilt it per unknown phoneme)
    single_symbol = regex.compile("[" + phonemes_alone + "]")
    wordVector = []
    for phoneme in phonemes:
        if phoneme in model:
            wordVector.append(model[phoneme])
        else:
            # unknown multi-symbol phoneme: fall back to its single symbols
            wordVector.extend(model[ph] for ph in single_symbol.findall(phoneme))
    if padToMaxLength:
        # pad with all-zero rows up to the requested length
        return np.pad(np.array(wordVector),
                      ((0, padToMaxLength - len(wordVector)), (0, 0)),
                      mode="constant")
    return wordVector
示例10: main
def main():
    """Scan the git tree for markdown-style URLs, verify them with a pool of
    worker threads, and report per file which links need fixing or removal.

    NOTE(review): relies on module-level names defined elsewhere in the
    file: URL_REGEX, NUM_WORKER_THREADS, worker, gQueue, gProcessed,
    gEncounteredErrors and colored -- confirm their contracts there.
    """
    # grep the whole repository for "(http...)" style links
    gitGrep = subprocess.Popen(["git", "grep", "(\s*http.*)"],
                               stdout=subprocess.PIPE)
    for line in gitGrep.stdout:
        # NOTE(review): on Python 3 `line` is bytes while URL_REGEX is
        # presumably a str pattern -- this looks Python-2 only; verify.
        urls = regex.findall(URL_REGEX, line)
        for url in urls:
            # url[1]: URL_REGEX evidently captures at least two groups,
            # the second being the bare URL -- confirm at its definition
            gQueue.put(url[1])
    for i in range(NUM_WORKER_THREADS):
        t = threading.Thread(target=worker)
        t.daemon = True  # don't block interpreter exit on stuck workers
        t.start()
    # block until we've finished all our jobs
    gQueue.join()
    # finally output each file and the corresponding URLs to fix/remove
    for key in gProcessed:
        print("%s has the following changes needed:" % (
            colored(key, 'yellow')))
        for url in gProcessed[key]:
            print("- %s" % url)
    # an empty dict evaluates as False, so we can take advantage of that to
    # return 0 when we have no matches, and 1 when there are matches
    sys.exit(gEncounteredErrors)
示例11: extract_features
def extract_features(name):
    """Extract surface classification features from a release *name*.

    Returns a dict of features (length, token count, resolution, quality
    tags, season/episode/airdate flags, ...) consumed by the categoriser.
    String-valued features are a '|'-joined sorted list of all pattern hits
    (or None); flag features are plain booleans.
    """
    def find(reg, s):
        # renamed parameter (the original called it `str`, shadowing the
        # builtin); case-insensitive search over the whole name
        res = regex.findall(reg, s, regex.I)
        if res:
            # deterministic, order-independent representation of all hits
            return '|'.join(sorted(res))
        else:
            return None
    return {
        'length': len(name),
        'tokens': len(regex.findall('[\w\']+', name)),
        'resolution': find('(720|1080)', name),
        'quality': find('(SDTV|HDTV|PDTV|WEB-?DL|WEBRIP|XVID|DIVX|DVDR|DVD-RIP|x264|dvd|XvidHD|AVC|AAC|VC\-?1|wmvhd|web\-dl|BRRIP|HDRIP|HDDVD|bddvd|BDRIP|webscr|bluray|bd?25|bd?50|blu-ray|BDREMUX)', name),
        '3d': bool(find('(3D)', name)),
        'subgroup': find('\[(\w+)\]', name),
        'filehash': bool(find('\[([0-9a-fA-F]{8})\]', name)),
        'season': bool(find('(S\d{1,2})', name)),
        'episode': bool(find('(E\d{1,2})', name)),
        'airdate': bool(find('((?:\d{4}[.-/ ]\d{2}[.-/ ]\d{2})|(?:\d{2}[.-/ ]\d{2}[.-/ ]\d{4}))', name)),
        'year': bool(find('[.-/ ](\d{4})[.-/ ]', name)),
        'versus': bool(find('[.-/ ](vs?)[.-/ ]', name)),
        'music': bool(find('((?:^VA(?:\-|\_|\ ))|(?:MP3|VBR|NMR|CDM|FLAC|\-(?:CDR?|EP|LP|SAT|2CD|FM|VINYL|DE|CABLE|TAPE)\-))', name)),
        'ebook': bool(find('(e?\-?book|html|epub|pdf|mobi|azw|doc|isbn)', name)),
        'comic': bool(find('(cbr|cbz)', name)),
        'magazine': bool(find('(mag(?:s|azine?s?))', name)),
        'sport': find('(epl|motogp|bellator|supercup|wtcc|bundesliga|uefa|espn|wwe|wwf|wcw|mma|ucf|fia|pga|nfl|ncaa|fifa|mlb|nrl|nhl|afl|nba|wimbledon|cricket)[\. -_]', name),
        'xxx': bool(find('(xxx|imageset|porn|erotica)', name)),
        'game': find('(PS3|3DS|NDS|PS4|XBOX|XBONE|WII|DLC|CONSOLE|PSP|X360|PS4)', name),
        'foreign': bool(find('(seizoen|staffel|danish|flemish|dutch|Deutsch|nl\.?subbed|nl\.?sub|\.NL|\.ITA|norwegian|swedish|swesub|french|german|spanish|icelandic|finnish|Chinese\.Subbed|vostfr|Hebrew\.Dubbed|\.HEB\.|Nordic|Hebdub|NLSubs|NL\-Subs|NLSub|Deutsch| der |German | NL |\.PL\.)', name)),
        'pc': bool(find('((?:v?\d\.\d\.)|(?:x64|32bit|64bit|exe))', name)),
        'documentary': bool(find('(documentary|national geographic|natgeo)', name))
    }
开发者ID:Murodese,项目名称:pynab,代码行数:32,代码来源:categories.py
示例12: idhit
def idhit(self,
          id,
          seq,
          primerDict
          ):
    """Fuzzy-search *seq* for any primer in *primerDict* (up to 5 errors)
    and, when at least 250nt of sequence lies 20nt downstream of the best
    hit, store fixed-length fragments on self.f200 .. self.f250 keyed by
    *id*.

    Returns True as soon as one primer yields a full 250nt window, False
    otherwise.  Python 2 code (dict.iteritems); the "{e<=5}" fuzzy syntax
    requires the third-party `regex` module.
    """
    for primer in primerDict:
        #match=regex.findall("(%s){e<=5}" % (primer), str(seq))
        ## checks if more than one match!
        #if len(match)==1:
        #ind = str(seq).index(match[0])
        # fuzzy-find the primer in seq, tolerating up to 5 edits
        a={id: regex.findall("(%s){e<=5}" % (primer), str(seq))}
        # drop the entry entirely when there was no match (a becomes {})
        a={key:item for key, item in a.iteritems() if item}
        for key,item in a.iteritems():
            if len(item)>1:
                # several fuzzy hits: keep the one most similar to the primer
                best_match=[difflib.SequenceMatcher(None, x, primer).ratio() for x in item]
                best=item[ best_match.index(max(best_match)) ]
                # ind = first position *after* the matched primer
                ind = str(seq).index(best) + len(best)
            else:
                best=item[0]
                ind = str(seq).index(best) + len(best)
            # need a full 250nt window starting 20nt past the primer's end
            if len(str(seq)[ind+20:ind+270])==250:
                self.f250[id] = str(seq)[ind+20:ind+270]
                self.f200[id] = str(seq)[ind+20:ind+220]
                self.f210[id] = str(seq)[ind+20:ind+230]
                self.f220[id] = str(seq)[ind+20:ind+240]
                self.f230[id] = str(seq)[ind+20:ind+250]
                self.f240[id] = str(seq)[ind+20:ind+260]
                return True
            else:
                continue
    return False
开发者ID:JoshDaly,项目名称:scriptShed,代码行数:35,代码来源:fragment_fuzzy-josh-tmp.py
示例13: call_C_primer
def call_C_primer(C_seq, C_primer_match_dict):
    """Identify which isotype primer terminates *C_seq* and strip it off.

    Only the last 25nt of the constant-region sequence are searched.  For
    each candidate primer we try, in order: an exact match, an exact match
    of the primer minus its last 4nt (accommodates 12nt vs 8nt barcodes),
    then a fuzzy match tolerating up to 2 errors (regex module).

    Returns (primer_isotype, C_seq_without_primer); ('no_primer', C_seq)
    when nothing matches.
    """
    seq_len = len(C_seq)
    tail = C_seq[-25:]
    for primer_seq, primer_name in C_primer_match_dict.items():
        hit = tail.find(primer_seq)
        if hit == -1:
            # shortened primer variant
            hit = tail.find(primer_seq[:-4])
        if hit == -1:
            # fuzzy search allowing 2 errors; findall returns a list
            fuzzy = regex.findall('(%s){e<=2}' % primer_seq[:-4], tail, regex.BESTMATCH)
            if fuzzy != []:
                hit = tail.find(fuzzy[0])
        if hit != -1:
            # the match index is relative to the 25nt tail, so re-anchor it
            # against the full constant sequence before slicing
            cut = hit + seq_len - 25
            return primer_name, C_seq[:cut]
    return 'no_primer', C_seq
开发者ID:felixhorns,项目名称:BCellClassSwitching,代码行数:25,代码来源:parse_igblast.py
示例14: getString
def getString(self, word):
    """Normalise *word*: collapse every vowel to the generic symbol 'V',
    re-segment into phonemes, and return the concatenated phoneme string.
    Phonemes missing from self.phoneme_feature_dict are decomposed into
    their single symbols."""
    # turn every vowel into generic "V"
    collapsed = regex.sub("[aeiouE3]", "V", word)
    symbols = "pbmfv84tdszcnSZCjT5kgxNqGX7hlLwyr!V"
    # single symbol (optionally " or *) not opening a ~/$ group,
    # or a 2-symbol ~ digraph, or a 3-symbol $ trigraph
    pattern = ("[" + symbols + "][\"\*]?(?![" + symbols + "]~|[" + symbols + "]{2}\$)"
               "|[" + symbols + "]{2}?~|[" + symbols + "]{3}?\$")
    pieces = []
    for ph in regex.findall(pattern, collapsed):
        if ph in self.phoneme_feature_dict:
            pieces.append(ph)
        else:
            # unknown phoneme: fall back to its individual symbols
            pieces.extend(regex.findall("[" + symbols + "]", ph))
    return "".join(pieces)
开发者ID:marlonbetz,项目名称:BA,代码行数:16,代码来源:binary_phoneme_features.py
示例15: parse_readme
def parse_readme(self, rules):
    """Parse the scapegoat README.md and merge rule metadata into *rules*.

    Only the '### Inspections' section is scanned.  Markdown table rows
    (|Name|wikitext|) contribute a 'wikitext' entry per rule; free-form
    '#####' blocks contribute a 'description' keyed by the CamelCased
    first line.  Mutates *rules* in place.

    Raises Exception when the README is missing or yields no rules at all.
    """
    filepath = os.path.join(self.scapegoat_dir, 'README.md')
    if not os.path.isfile(filepath):
        raise Exception('Could not open "%s"' % filepath)
    # context manager guarantees the handle is closed even on error
    # (the original used open/read/close with no try/finally)
    with open(filepath, 'r') as f:
        content = f.read()
    m = regex.search(r'\n### Inspections(.*)\n### ', content, regex.DOTALL)
    if m is None:
        return
    blocks = m.group(1).split('#####')
    for block in blocks:
        block = block.strip()
        if '|Name|' in block:
            # markdown table: one |Name|wikitext| row per rule
            mx = regex.findall(r'\n\|([^\|]+)\|([^\|]*)\|', block)
            for row in mx:
                key = row[0].strip()
                wikitext = row[1].strip()
                # skip the header row and '---' separator rows
                if key and key[0] != '-' and key != 'Name':
                    rules.setdefault(key, {})['wikitext'] = wikitext
        else:
            # free-form block: first line is the rule name, rest is prose
            (key, desc) = block.split('\n', 1)
            # CamelCase the name, e.g. 'unused method' -> 'UnusedMethod'
            key = ''.join(word[0].upper() + word[1:] for word in key.split())
            rules.setdefault(key, {})['description'] = desc.strip()
    if not rules:
        raise Exception('invalid scapegoat readme')
示例16: get_stats_from_xml_string
def get_stats_from_xml_string(xml_string, text_label="", stats=None):
    """Count occurrences of each distinct opening tag (incl. attributes).

    Returns a dict mapping the raw tag content (the text between '<' and
    '>', stripped) to {'text': label of the document it was first seen in,
    'count': n}.  Pass a previous result as *stats* to accumulate counts
    across several documents.
    """
    els = {} if stats is None else stats
    # prefer the third-party `regex` module (as the original project did),
    # but the pattern is stdlib-compatible so fall back gracefully.
    # Bug fix: the original pattern used a ur"..." literal, which is a
    # SyntaxError on Python 3; a plain raw string is equivalent here.
    try:
        import regex as re
    except ImportError:
        import re
    # opening tags only: skip '</...', '<!...', '<?...' and empty '<>'
    elements = re.findall(r"<([^/!?>][^>]*)>", xml_string)
    for el in elements:
        el = el.strip()
        if el not in els:
            # first sighting: remember which text it came from
            els[el] = {"text": text_label, "count": 1}
        else:
            els[el]["count"] += 1
    return els
开发者ID:kcl-ddh,项目名称:digipal,代码行数:26,代码来源:utils.py
示例17: update_post
def update_post(username, slug):
    """Flask view: update a post's content and/or cursor position for the
    logged-in user; responds with JSON {'error': None | message}.

    Expects form fields 'content' (str) and/or 'cursor' (int).
    NOTE(review): the `username` route argument is unused -- the post is
    looked up on `current_user` instead; presumably access control happens
    upstream.  Verify the decorators (not visible here).
    """
    user = current_user
    content = request.form.get('content', type=str)
    cursor = request.form.get('cursor', type=int)
    if content is not None:
        post = user.posts.filter_by(slug=slug).first()
        if post:
            # no explicit cursor sent: park the caret at the end of content
            post.cursor = len(content) if not cursor else cursor
            post.modified_timestamp = datetime.utcnow()
            # Get meta: capture everything between <<...>> markers
            # (atomic group tolerates single '<'/'>' inside; non-greedy)
            r = regex.compile(r'<<((?:(?>[^<>]+)|<(?!<)|>(?!>))*?)>>', regex.I | regex.S)
            post.meta = json.dumps(regex.findall(r, content))
            # Encrypt: session holds half the key, app config the other half
            half_key = session[generate_hash(user.user_key_salt)]
            key = xor_keys(half_key, app.config['MASTER_KEY'])
            # compress before encrypting
            content = snappy.compress(content)
            content = AES_encrypt(key, user.username, content)
            post.content = content
            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    elif cursor is not None:
        # cursor-only update (autosave of caret position)
        post = user.posts.filter_by(slug=slug).first()
        if post:
            post.cursor = cursor
            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    return jsonify(error="Invalid parameters")
开发者ID:kevinisaac,项目名称:journal,代码行数:35,代码来源:views.py
示例18: get_random_dna
def get_random_dna(length, max_repeat_nuc=float('inf'), invalid_patterns=None):
    '''Returns a random sequence of DNA of the supplied length,
    while adhering to a maximum number of repeating nucleotides and
    containing none of the supplied invalid regex patterns.

    Raises ValueError when no valid sequence is found within 1000 attempts.
    Requires the `regex` module's overlapped=True search; relies on the
    module-level helpers _get_random_dna and is_valid.
    '''
    max_attempts = 1000
    if invalid_patterns is None:
        invalid_patterns = []
    # generate-and-test: up to max_attempts candidates, same as the
    # original's while/counter loop (which also raised after 1000 tries
    # and carried an unreachable `return None` after `while True`)
    for _ in range(max_attempts):
        random_dna = _get_random_dna(length)
        # reject as soon as any forbidden pattern occurs (overlapped
        # matches count too); the original scanned every pattern even
        # after the candidate was already known to be invalid
        if any(re.findall(invalid_pattern, random_dna, overlapped=True)
               for invalid_pattern in invalid_patterns):
            continue
        if is_valid(random_dna, max_repeat_nuc):
            return random_dna
    raise ValueError('Unable to optimise sequence. ' +
                     'Greater than ' + str(max_repeat_nuc) +
                     ' repeating nucleotides.')
开发者ID:synbiochem,项目名称:synbiochem-py,代码行数:30,代码来源:seq_utils.py
示例19: tokenize2
def tokenize2(text):
    r"""Break *text* into words and return them as a list.

    A word is a maximal run of Unicode letters; the ``\p{L}`` property
    class requires the third-party ``regex`` module.
    """
    return re.findall('\p{L}+', text)
开发者ID:pnugues,项目名称:ilppp,代码行数:7,代码来源:tokenizer.py
示例20: findall_p_in_s
def findall_p_in_s(p, s):
    """Return a list of Match objects, one per occurrence of pattern *p*
    in string *s*, each carrying the matched text and its start/end
    indexes."""
    # Single finditer pass yields both the matched text and its span.
    # (The original zipped findall with finditer; findall returns *group*
    # contents when the pattern has capturing groups, which would pair the
    # wrong string with a span -- finditer's group(0) is always the full
    # match.  It also fixes the original's mangled docstring quoting.)
    return [Match(m.group(0), m.start(0), m.end(0))
            for m in regex.finditer(p, s)]
开发者ID:apathinwalking,项目名称:tidyaddr,代码行数:7,代码来源:match.py
注:本文中的regex.findall函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论