本文整理汇总了 Python 中 re.sub 函数的典型用法代码示例。如果您正苦于以下问题：Python sub 函数的具体用法？Python sub 怎么用？Python sub 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了sub函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: _parse_productions
def _parse_productions(self):
    """
    Build the list of productions described by the text widget buffer.

    The whole buffer is read from ``self._textwidget``; every match of
    ``self.ARROW`` is rewritten to ``->`` and each tab becomes a single
    space.  Every non-blank line is then handed to
    ``parse_cfg_production`` and the productions it returns are
    collected in line order.
    """
    raw_text = self._textwidget.get('1.0', 'end')
    normalized = re.sub('\t', ' ', re.sub(self.ARROW, '->', raw_text))

    productions = []
    for raw_line in normalized.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            # An older hand-rolled, regex-based production parser used to
            # live here; parse_cfg_production does that work now.
            productions.extend(parse_cfg_production(stripped))
    return productions
开发者ID:sfu-natlang,项目名称:nltk,代码行数:35,代码来源:cfg.py
示例2: getPanelInfo
def getPanelInfo(self, doc, strXPath):
    """
    Cut an embedded HTML fragment out of the text content of ``doc``.

    The text is searched for ``strXPath``; from there up to the first
    ``})`` the value following ``"html":"`` is extracted, escaped
    newlines/tabs/backslashes are removed, ``\\/`` becomes ``/`` and the
    ``&lt;`` / ``&gt;`` entities are decoded.

    :param doc: object exposing ``text_content()`` (e.g. an lxml element).
    :param strXPath: marker string that must occur in the text.
    :returns: the cleaned fragment, or ``""`` when the marker or the
        closing ``})`` is missing, the fragment is empty, or any error
        occurs (errors are logged, never raised).
    """
    try:
        # Fetch the text once instead of once per use.
        full_text = doc.text_content()
        npos = full_text.find(strXPath)
        if npos == -1:
            return ""
        strContent = full_text[npos:-1]
        npos = strContent.find("})")
        if npos == -1:
            return ""
        strContent = strContent[0:npos+1]
        # Keep what follows '"html":"' and drop the 4 trailing characters
        # (presumably the closing '\n"}' of the payload - TODO confirm).
        strContent = strContent[strContent.find("\"html\":\"")+8:-4]
        if "v2" in self.xpathType:
            # NOTE(review): str.decode only exists on Python 2 strings.
            strContent = strContent.decode('unicode-escape')
        strContent = re.sub(r"(\\n)*(\\t)*(\\ /)*(\\)*", "", strContent)
        strContent = re.sub(r"\\/", "/", strContent)
        if not strContent:
            return ""
        # The original replaced "<" with "<" and ">" with ">", which does
        # nothing; decoding the HTML entities is the evident intent.
        strContent = (strContent.replace("&lt;", "<")
                                .replace("&gt;", ">")
                                .replace("nbsp;", ""))
    except Exception:
        s = sys.exc_info()
        msg = (u"getPanelInfo Error %s happened on line %d" % (s[1], s[2].tb_lineno))
        logger.error(msg)
        return ""
    return strContent
开发者ID:CnPaMeng,项目名称:WeiboMsgBackupGUI,代码行数:25,代码来源:msgcomcrawler.py
示例3: GetBook
def GetBook(self, book):
    """
    Download ``book`` chapter by chapter and fill ``self.content``.

    Each chapter is requested from biblia.deon.pl with a POST carrying the
    book name (ISO-8859-2 encoded) and the chapter number.  On the first
    chapter the title ``<span>`` is converted into an anchored
    ``<div class="tytul">`` and the chapter count is read from the
    ``rozdzial`` select box.  Chapter text and footnotes are gathered by
    ``Book.GetContent`` / ``Book.GetFootnotes``; once the last chapter has
    been processed the collected footnotes are appended to the content.
    """
    self.footnotes = []
    self.content = []
    counter = 1
    plainBook = unicodeToPlain(book)
    endpoint = 'http://www.biblia.deon.pl/otworz.php'
    while True:
        form = urllib.urlencode({'ksiega': book.encode('iso8859_2'),
                                 'rozdzial': str(counter)})
        request = urllib2.Request(endpoint, form)
        response = urllib2.urlopen(request).read()
        doc = html.fromstring(response)

        if counter != 1:
            self.content.append('<br><br>')
        else:
            BookTitle = doc.findall('.//span[@style="font-size:22px;"]')[0]
            # Turn the title span into an anchored title div.
            title_markup = re.sub(r'<span style=\"font-size:22px;\"',
                                  r'<br><br><a name="K' + plainBook + r'"></a><div class="tytul"',
                                  html.tostring(BookTitle))
            self.content.append(re.sub(r'</span>', r'</div>', title_markup))
            ChaptersInBook = len(doc.findall('.//select[@name="rozdzial"]/option'))

        chapter_label = str(counter)
        plainPrefix = plainBook + chapter_label
        self.content.append('<div class="numer">' + chapter_label + '</div>')
        Book.GetContent(self, doc.xpath('//div[@class="tresc"]')[0], plainPrefix)
        Book.GetFootnotes(self,
                          doc.xpath('//td[@width="150"]/table/tr[5]/td/div[1]')[0],
                          plainPrefix,
                          unicodeToReference(book) + ' ' + chapter_label)

        if counter == ChaptersInBook:
            self.content.append('<br><br>' + "".join(self.footnotes))
            break
        counter += 1
开发者ID:kamm,项目名称:project_b,代码行数:29,代码来源:project_b.py
示例4: extract_bow_v2_features
def _bow_v2_document(row, stemmer):
    """Build the stemmed bag-of-words string for one data row."""
    def plain(markup):
        # Strip markup; text nodes are joined with a single space.
        return BeautifulSoup(markup, "lxml").get_text(" ")

    # Query tokens get a "q" prefix and title tokens a "z" prefix so they
    # stay distinct from description tokens.
    query = " ".join("q" + tok for tok in plain(row["search_term"]).split(" "))
    title = " ".join("z" + tok for tok in plain(row["product_title"]).split(" "))
    document = query + " " + title + " " + plain(row["product_description"])
    document = re.sub("[^a-zA-Z0-9]", " ", document)
    return " ".join(stemmer.stem(tok) for tok in document.split(" "))


def extract_bow_v2_features(train, test, test_contains_labels = False):
    '''
    Performs feature extraction for another simple tfidf model used for
    ensembling purposes.

    Every row is turned into one string: prefixed search-term tokens,
    prefixed product-title tokens and the product description, with all
    non-alphanumeric characters blanked and every token stemmed.

    NOTE: the original extracted test text with ``get_text()`` but train
    text with ``get_text(" ")``, so adjacent text nodes were glued
    together in the test set only.  Both sets now share one pipeline.

    train -- DataFrame with search_term, product_title,
             product_description and relevance columns.
    test -- DataFrame with the same text columns; relevance is only read
            when test_contains_labels is true.

    Returns (s_data, s_labels, t_data, t_labels); labels are strings and
    t_labels stays empty unless test_contains_labels is true.
    '''
    stemmer = PorterStemmer()

    s_data = []
    s_labels = []
    for _, row in train.iterrows():
        s_data.append(_bow_v2_document(row, stemmer))
        s_labels.append(str(row["relevance"]))

    t_data = []
    t_labels = []
    for _, row in test.iterrows():
        t_data.append(_bow_v2_document(row, stemmer))
        if test_contains_labels:
            t_labels.append(str(row["relevance"]))

    return (s_data, s_labels, t_data, t_labels)
开发者ID:pringlesLion,项目名称:mycs231n,代码行数:26,代码来源:extract.py
示例5: gen_xkcd_sub
def gen_xkcd_sub(msg, hook=False):
    """
    Apply the xkcd word substitutions to ``msg``.

    Each phrase in the table is replaced on word boundaries, then one
    ``"<x>-ass <y>"`` occurrence per line is rewritten to
    ``"<x> ass-<y>"``.

    :param msg: text to rewrite.
    :param hook: when true, return ``None`` instead of the unchanged
        message if nothing was substituted.
    :returns: the rewritten text, the original ``msg`` (nothing changed,
        ``hook`` false) or ``None`` (nothing changed, ``hook`` true).
    """
    substitutions = {
        # http://xkcd.com/1288/
        'witnesses': 'these dudes I know',
        'allegedly': 'kinda probably',
        'new study': 'tumblr post',
        'rebuild': 'avenge',
        'space': 'SPAAAAAACCCEEEEE',
        'google glass': 'virtual boy',
        'smartphone': 'pokedex',
        'electric': 'atomic',
        'senator': 'elf-lord',
        'car': 'cat',
        'election': 'eating contest',
        'congressional leaders': 'river spirits',
        'homeland security': 'homestar runner',
        'could not be reached for comment': 'is guilty and everyone knows it',
        # http://xkcd.com/1031/
        'keyboard': 'leopard',
        # http://xkcd.com/1418/
        'force': 'horse',
    }
    output = msg
    # The original guarded this loop with
    # "not hook or random() < 0.001 or True", which is always true; the
    # dead condition (and its wasted random() call) has been removed.
    for text, replacement in substitutions.items():
        # Cheap substring test before paying for the regex.
        if text in output:
            output = re.sub(r"\b%s\b" % text, replacement, output)
    output = re.sub(r'(.*)(?:-ass )(.*)', r'\1 ass-\2', output)
    if msg == output:
        return None if hook else msg
    return output
开发者ID:tjcsl,项目名称:cslbot,代码行数:33,代码来源:textutils.py
示例6: classifyText
def classifyText( text, params ):
    """
    Score ``text`` with the linear classifier held in ``params``.

    The text is cleaned (HTML cleaner, tag stripping, whitespace
    collapsing, lower-casing), tokenised into runs of ``a-z``, filtered
    against ``params.stopword_list`` and stemmed with
    ``params.porterStemmer``.  The score ``z`` is the sum of
    ``params.linear_classifier`` weights over all stems found in it.

    Returns a tuple ``(z < 0, [start_time, end_time, stem_count, z,
    sigmoid(z), int(z > 0)])`` where the times come from
    ``params.my_time()``.
    """
    start_time = params.my_time()

    # Clean.  The HTML cleaner is best effort: if it fails the raw text is
    # used.  "except Exception" (not a bare except) keeps
    # KeyboardInterrupt / SystemExit propagating.
    try:
        text = params.cleaner.clean_html( text )
    except Exception:
        pass
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    # Tokenize, drop stop words, stem.
    tokens = re.findall(r'[a-z]+', text)
    stems = [params.porterStemmer.stem(t, 0, len(t) - 1)
             for t in tokens
             if t not in params.stopword_list]

    # The intercept term is deliberately not applied (it was commented out
    # in the original: params.linear_classifier['{{intercept}}']+.6).
    z = 0
    for s in stems:
        if s in params.linear_classifier:
            z += params.linear_classifier[s]
    end_time = params.my_time()

    # math.exp overflows for very negative z; the sigmoid is 0 there.
    try:
        probability = 1/(1+math.exp(-z))
    except OverflowError:
        probability = 0.0
    return ( z<0, [start_time, end_time, len(stems), z, probability, int(z>0)] )
开发者ID:dustin-stoltz,项目名称:snowcrawl,代码行数:34,代码来源:test3_processor.py
示例7: htmlify
def htmlify(self, text):
    """
    Wrap plain text in HTML paragraphs.

    The stripped text is enclosed in ``<p>...</p>``; runs of two or more
    newlines become paragraph breaks and remaining single newlines become
    ``<br>``.  The text is not HTML-escaped (an escape call is disabled in
    the original source).
    """
    wrapped = "<p>%s</p>" % text.strip()
    paragraphs = re.sub('\n\n+', '</p><p>', wrapped)
    return re.sub('\n', '<br>', paragraphs)
开发者ID:ChrisWellington,项目名称:gourmet,代码行数:7,代码来源:html_exporter.py
示例8: convert_corpus
def convert_corpus(filepath, mapping, alignment, begin="xxBeGiN142xx", end="xxEnD142xx"):
    """
    Translate the token stream stored in ``filepath`` into prose.

    Runs of the ``begin`` marker are treated as sentence breaks and runs
    of newlines as paragraph breaks.  Every other token is looked up as
    ``mapping[alignment[token]]`` and appended, capitalised when it
    follows sentence-ending punctuation or a paragraph break.

    :param filepath: path of the text file to convert.
    :param mapping: dict from alignment value to output word.
    :param alignment: dict from input token to alignment value; a token
        missing from it raises ``KeyError``.
    :param begin: marker separating sentences in the input.
    :param end: accepted for interface compatibility; not used here.
    :returns: the converted text, or ``''`` when no word was produced.
    """
    # Text mode: the regexes below are str patterns.
    with open(filepath, 'r') as f:
        general_corpus = re.sub('(' + begin + r'\W+)+', ' . ', f.read())
    general_corpus = re.sub(r'\n+', ' this_is_n3wline ', general_corpus)

    # Weighted pool of sentence punctuation.  The original list was missing
    # a comma between two ',' literals, which produced a ',,' entry.
    punctuation = ['.', '.', '.', ',', ',', ',', '!', '?']

    corpus = []
    for token in general_corpus.split():
        if token == '.':
            # Never punctuate right after a paragraph break.
            if corpus and '\n' not in corpus[-1]:
                corpus[-1] = corpus[-1] + random.choice(punctuation)
        elif token == 'this_is_n3wline':
            # A newline before any word has nothing to attach to.
            if corpus:
                corpus[-1] = corpus[-1] + '.\n\n'
        elif alignment[token] in mapping:
            word = mapping[alignment[token]]
            if corpus and re.search(r'[\n\.!?]', corpus[-1]):
                corpus.append(word.capitalize().strip())
            else:
                corpus.append(word.strip())

    if not corpus:
        return ''
    corpus[0] = corpus[0].capitalize()
    output = ' '.join(corpus)
    output = re.sub(r' +', ' ', output)
    output = re.sub(r'\n+ ', '\n\n', output)
    return output
开发者ID:lizrush,项目名称:shorties,代码行数:25,代码来源:convert_vocab.py
示例9: getCategoryUrl
def getCategoryUrl(site="",url=""):
    """
    Crawl the three-level category listing at ``url`` into the category table.

    The page is walked as ``.classify_books`` > ``.classify_kind`` >
    ``ul li a``.  A link is kept when its href matches ``/cp<id>.html``
    (type ``book``) or ``/cid<id>.html`` (type ``nonbook``); categories
    whose id is already stored are skipped with a debug log entry.

    Returns ``False`` when the response body is empty, ``True`` otherwise.
    """
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False

    # (href pattern, category type) pairs, tried in this order.
    id_patterns = ((r'\/cp(.*)\.html', 'book'),
                   (r'\/cid(.*)\.html', 'nonbook'))

    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = re.sub(r'\s', '', level1.select('.classify_title')[0].text)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = re.sub(r'\s', '', level2.select('.classify_kind_name')[0].text)
            for level3 in level2.select('ul li a'):
                curLevel3 = level3.text.strip()
                curlUrl = level3['href']

                for pattern, catType in id_patterns:
                    retFind = re.findall(pattern, curlUrl)
                    if retFind:
                        break
                else:
                    # Neither pattern matched: not a category link.
                    continue

                curCatID = retFind[0]
                if catDb.find({'catId':curCatID}).count() > 0:
                    logger.debug('catetogy %s exists,skip\n'%(curCatID))
                else:
                    catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl,'catType':catType, 'site':site})
    return True
开发者ID:Neilfu,项目名称:NLP,代码行数:32,代码来源:getEcommence_dangdang.py
示例10: obfuscate_codeblocks
def obfuscate_codeblocks(source):
    """Replace every code block in ``source`` with a numbered placeholder.

    Hiding code block contents makes it safe to run other transformations
    on the source; the protected contents are returned so they can be put
    back afterwards.

    Parameters
    ----------
    source : str
        string (as single stream) containing the source

    Returns
    -------
    protected_contents : list
        one ``[start, end, text]`` entry per code block found
    str
        source with each code block replaced by ``$PROTECTED-<n>``, where
        ``n`` is the 1-based position of the block in ``protected_contents``

    >>> source = '``` my code block ``` other contents'
    >>> prot, ob_source = obfuscate_codeblocks(source)
    >>> prot[0][2]
    '``` my code block ```'
    >>> ob_source
    '$PROTECTED-1 other contents'
    """
    obfuscate_source = source
    protected_contents = []
    # First the plain code block syntax, then the HTML one.
    for regex in (__regex_codeblock__, __regex_codeblock_html__):
        # finditer scans the string as it was when this pass started, so
        # recorded offsets refer to that string; each iteration then
        # replaces the first remaining block in the working copy.
        for match in re.finditer(regex, obfuscate_source):
            protected_contents.append([match.start(), match.end(), match.group()])
            placeholder = '$PROTECTED-' + str(len(protected_contents))
            obfuscate_source = re.sub(regex, placeholder, obfuscate_source, 1)
    return protected_contents, obfuscate_source
开发者ID:nunb,项目名称:MaTiSSe,代码行数:34,代码来源:source_editor.py
示例11: makeIdentifier
def makeIdentifier(self, string):
    """
    Turn a title into a lower-case, underscore-separated identifier.

    Whitespace runs are collapsed, the text is passed through
    ``safeEncode`` and NFKD-normalised, punctuation is removed, runs of
    spaces/underscores become one underscore and leading or trailing
    underscores are trimmed.
    """
    string = re.sub(r"\s+", " ", string.strip())
    string = unicodedata.normalize('NFKD', safeEncode(string))
    # NOTE(review): this copy of the source had the e-mail obfuscation
    # artifact "[email protected]" inside the character class, which closed
    # the class early and left a pattern that does not compile.  The class
    # is restored to the punctuation set it most plausibly contained
    # ("!?@" in place of the artifact) - confirm against upstream.
    string = re.sub(r"['\"!?@#$&%^*\(\)_+\.,;:/]", "", string)
    string = re.sub(r"[_ ]+", "_", string)
    string = string.strip('_')
    return string.strip().lower()
开发者ID:RussianPlex,项目名称:WikipediaRu,代码行数:7,代码来源:__init__.py
示例12: _sanitize
def _sanitize(self, data):
    """
    Strip terminal colour sequences and decoration from ``data``.

    Text without an ESC character is returned untouched.  Otherwise all
    non-printable characters are dropped, the remaining ``[<digits;>m``
    colour codes are removed and the characters ``{ } * %`` are deleted.
    """
    if data.find('\x1b') == -1:
        return data
    # join() instead of filter(): on Python 3 filter() returns an iterator,
    # which re.sub cannot process.
    printable = ''.join(ch for ch in data if ch in string.printable)
    without_colors = re.sub(r'\[[0-9\;]+m', '', printable)
    return re.sub(r'(\{|\}|\*|\%)', '', without_colors)
开发者ID:fuzzy,项目名称:abused,代码行数:7,代码来源:emerge.py
示例13: _clean_text
def _clean_text(self, text):
    """ Cleans up text before we make it into an HTML tree:
    1. Nukes <![CDATA stuff.
    2. Nukes XML encoding declarations
    3. Replaces </br> with <br/>
    4. Nukes characters that are not valid in XML
    """
    # Remove <![CDATA because it causes breakage in lxml.
    text = re.sub(r"<!\[CDATA\[", u"", text)
    text = re.sub(r"\]\]>", u"", text)

    # Remove <?xml> declaration in Unicode objects, because it causes an error:
    # "ValueError: Unicode strings with encoding declaration are not supported."
    # Note that the error only occurs if the <?xml> tag has an "encoding"
    # attribute, but we remove it in all cases, as there's no downside to
    # removing it. This moves our encoding detection to chardet, rather than
    # lxml.
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so the check
    # works on both without referencing the Python 2-only name.
    if isinstance(text, type(u"")):
        text = re.sub(r"^\s*<\?xml\s+.*?\?>", "", text)

    # Fix </br>
    text = re.sub("</br>", "<br/>", text)

    # Fix invalid bytes (http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python)
    # The astral range needs 8-digit \U escapes: the original
    # "\u10000-\u10FFFF" was parsed as \u1000, the range 0-\u10FF and two
    # literal F characters, so every character above U+FFFF was deleted.
    # NOTE(review): on narrow Python 2 builds an astral range inside a
    # character class may not behave as intended - confirm if those builds
    # still matter.
    text = re.sub(u"[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+", "", text)
    return text
开发者ID:m4h7,项目名称:juriscraper,代码行数:28,代码来源:AbstractSite.py
示例14: sendGPS
def sendGPS(self, head, cmd):
    """
    Send ``head + cmd`` to the GPS device and wait for the matching reply.

    The input buffer is flushed, the command is written terminated by
    CR LF and up to 100 lines are read back.  The first line containing
    ``head`` is returned, starting at ``head`` and cut at the first ``*``.

    :returns: the reply text, or ``None`` when the device returns an
        empty read or no matching line arrives within 100 reads.
    """
    maxcnt = 100
    # Named "message" so the "string" module is not shadowed.
    message = head + cmd
    res = ""
    # print(...) with one pre-formatted argument behaves identically on
    # Python 2 and Python 3.
    print("GPS SEND: '%s'" % message)
    self.dev.flushInput()
    self.dev.write(message + "\r\n")
    for _ in range(maxcnt):
        res = self.dev.readline()
        if len(res) > 0:
            res = re.sub(r"^\s+", "", res)
            res = re.sub(r"\s+$", "", res)
            print("RAW GPS REPLY: '%s'" % res)
            pos = res.find(head)
            if pos != -1:
                res = res[pos:].split("*")[0]
                print("GPS REPLY: '%s'" % res)
                return res
        else:
            # An empty read aborts the wait.
            print("ZERO REPLY")
            return None
    print("sendGPS: FAILED: '%s'" % res)
    return None
开发者ID:ghalfacree,项目名称:clock-tamer,代码行数:26,代码来源:tamerdevice.py
示例15: main
def main():
    """
    Run pyflakes and print only the warnings that are not excluded.

    The modules to check come from the command line (default:
    ``django_evolution``) and are checked from the parent of this
    script's directory.  Lines of ``pyflakes.exclude`` (next to this
    script) that do not start with ``#`` are the exclusions; before
    comparing, line numbers in each pyflakes message are replaced by
    ``*``.
    """
    # Resolve to an absolute path first: after the chdir below a relative
    # cur_dir would point at the wrong place when opening the exclusions.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(cur_dir, ".."))

    modules = sys.argv[1:] or ['django_evolution']

    p = subprocess.Popen(['pyflakes'] + modules,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         close_fds=True,
                         universal_newlines=True)
    # communicate() drains both pipes, so a chatty stderr cannot block the
    # child, and it waits for the process to exit.
    output, _ = p.communicate()

    # Read in the exclusions file
    exclusions = set()
    with open(os.path.join(cur_dir, "pyflakes.exclude"), "r") as fp:
        for line in fp:
            if not line.startswith("#"):
                exclusions.add(line.rstrip())

    # Now filter things
    for line in output.splitlines():
        line = line.rstrip()
        test_line = re.sub(r':[0-9]+:', r':*:', line, 1)
        test_line = re.sub(r'line [0-9]+', r'line *', test_line)
        if test_line not in exclusions:
            print(line)
开发者ID:Aeron,项目名称:django-evolution,代码行数:32,代码来源:run-pyflakes.py
示例16: start
def start(self):
    """
    Load the Ushahidi settings and prepare the matcher and trigger URL.

    ``USHAHIDI_KEYWORD`` is interpolated into the message pattern as-is;
    ``USHAHIDI_RESPONSE`` and ``USHAHIDI_ERROR`` fall back to built-in
    texts.  ``USHAHIDI_TRIGGER_URL`` is mandatory (``AttributeError`` when
    absent); its ``${sender_number}`` and ``${message_content}``
    placeholders are converted to ``%(sender)s`` / ``%(message)s``.
    """
    self.default_response = getattr(settings, 'USHAHIDI_RESPONSE', 'Thank you for your report.')
    self.error_response = getattr(settings, 'USHAHIDI_ERROR', "Due to some error, we're unable to process your message. Please resend.")

    keyword = getattr(settings, 'USHAHIDI_KEYWORD', '')
    self.pattern = re.compile(r"^\s*(?:%s)(?:[\s,;:]+(.+))?$" % (keyword))

    # Rewrite the ${...} placeholders into %-format fields, one at a time.
    url_template = getattr(settings, 'USHAHIDI_TRIGGER_URL')
    url_template = re.sub(r'\$\{sender_number\}', '%(sender)s', url_template)
    self.trigger_url = re.sub(r'\$\{message_content\}', '%(message)s', url_template)
开发者ID:timbaobjects,项目名称:rapidsms-ushahidi-app,代码行数:7,代码来源:app.py
示例17: copy_template
def copy_template():
    """
    Instantiate the project template into the directory ``name``.

    Steps: prompt for the template configuration, copy the template tree
    to ``name``, delete the copied ``config.yaml`` if present, replace
    every sub-directory named ``options.template`` by a copy named
    ``name``, then substitute ``{{ variable }}`` and
    ``__config_variable__`` placeholders in every file through
    ``replace_variable``.
    """
    config_prompt(template)
    shutil.copytree(template, name)

    config_path = '%s/%s' % (name, 'config.yaml')
    if os.path.exists(config_path):
        os.remove(config_path)

    for dirname, dirnames, files in os.walk(name):
        for d in dirnames:
            if d == options.template:
                shutil.copytree('%s/%s' % (dirname, d), '%s/%s' % (dirname, name))
                shutil.rmtree('%s/%s' % (dirname, d))

    for dirname, dirnames, files in os.walk(name):
        for filename in files:
            path = '%s/%s' % (dirname, filename)
            # "with" closes the handle even if reading or writing fails.
            with open(path, 'r') as f:
                lines = f.readlines()

            first_pass = [re.sub(r'{{\s*(\w+)\s*}}', replace_variable, line) for line in lines]
            new_lines = [re.sub(r'__config_(\w+)__', replace_variable, line) for line in first_pass]

            with open(path, 'w') as f:
                f.write(''.join(new_lines))
开发者ID:rlayte,项目名称:kickstart,代码行数:25,代码来源:kickstart.py
示例18: parse_list
def parse_list(self, page):
    """
    Yield one inspection record per facility and date found in ``page``.

    ``page`` is CSV text with a header row.  Null bytes are replaced by
    spaces and runs of apostrophes are collapsed before parsing.  Only
    rows whose ``CITY`` is ``CHARLOTTE`` are used.  Each yielded record
    is the first row of an inspection with two extra keys:
    ``comments`` (an empty list) and ``violation`` (a list of
    ``{'id', 'value', 'comment'}`` dicts, one per row of that
    inspection).  Nothing is yielded when no row qualifies.
    """
    # Remove null bytes
    page = re.sub(r'\0', r' ', page)
    # Remove sequences of '''''''
    page = re.sub(r"'+", "'", page)
    reader = csv.DictReader(StringIO(page), quoting=csv.QUOTE_ALL, escapechar='\\')

    # There is one row in the data for each violation, not just each
    # inspection. Violations from the same inspection will be contiguous,
    # so roll up the violations until we see a different inspection.
    current_record = None
    for row in reader:
        if row['CITY'] != 'CHARLOTTE':
            continue
        row['comments'] = []
        # Strip any leading zeros. Both 01 and 1 appear sometimes, but
        # they mean the same thing.
        item_id = row['ITEM_NUM'].lstrip('0')
        violation = {'id': item_id, 'value': row['ITEM_VALUE'], 'comment': row['COMMENT']}

        same_inspection = (current_record is not None
                           and current_record['FAC_NAME'] == row['FAC_NAME']
                           and current_record['DATE'] == row['DATE'])
        if same_inspection:
            current_record['violation'].append(violation)
            continue

        if current_record is not None:
            yield current_record
        current_record = row
        current_record['violation'] = [violation]

    # The final record won't be yielded from the loop above because it has
    # no following record to trigger it, so yield it here - but only if
    # there is one (the original yielded None for empty input).
    if current_record is not None:
        yield current_record
开发者ID:frankk00,项目名称:openblock,代码行数:30,代码来源:retrieval.py
示例19: clean_word
def clean_word(word):
    """Strip whitespace, and digit runs that precede a space, from ``word``.

    First every ``<digits><space>`` sequence (the digits are optional, so a
    bare space matches too) is deleted, then all remaining whitespace is
    removed.
    """
    without_numbers = re.sub("[0-9]* ", "", word)
    compacted = re.sub(r"[\s]*", "", without_numbers)
    # Newlines and carriage returns are whitespace and are already gone at
    # this point; the explicit replaces are kept from the original.
    return compacted.replace('\n', '').replace('\r', '')
开发者ID:vincent-ferotin,项目名称:PhiloLogic4-WSGI,代码行数:7,代码来源:format.py
示例20: main
def main(argv):
    """
    Get, set or delete a key in an etcd or consul key/value store.

    Backend type, address, action, key and value come from
    ``parse_cli()``; ``argv`` itself is not read here.  The base URL is
    the address (any ``http://`` removed) plus the backend's key prefix
    (``/v2/keys/`` for etcd, ``/v1/kv/`` for consul), with repeated
    slashes collapsed, ``http://`` prepended and trailing slashes
    dropped.  For ``get`` the parsed value is printed when it is not
    ``None``.
    """
    # "store_type" instead of "type" so the builtin is not shadowed.
    (store_type, address, action, key, value) = parse_cli()

    address = re.sub('http://', '', address)
    if store_type == 'etcd':
        base_url = address + '/v2/keys/'
    elif store_type == 'consul':
        base_url = address + '/v1/kv/'
    else:
        base_url = ''

    # Collapse duplicate slashes before the scheme is added, so the "//"
    # of "http://" survives.
    base_url = re.sub(r'\/+', '/', base_url)
    base_url = 'http://' + base_url
    base_url = re.sub(r'\/+$', '', base_url)

    action = action.lower()
    if action == 'set':
        set_key_value(base_url, key, value)
    elif action == 'get':
        value = parse_value(get_key_value(base_url, key), store_type)
        if value is not None:
            # print(...) with a single argument works on Python 2 and 3.
            print(value)
    elif action == 'delete':
        delete_key_value(base_url, key)
开发者ID:amos6224,项目名称:ami-roles,代码行数:25,代码来源:key_value.py
注：本文中的 re.sub 函数示例由纯净天空整理自 Github/MSDocs 等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的 License；未经允许，请勿转载。
请发表评论