This article collects typical usage examples of the Python function nltk.clean_html. If you are wondering what clean_html actually does, how to call it, or how it is used in practice, the hand-picked code examples below should help.
Below are 20 code examples of clean_html, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system surface better Python code samples.
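Before the project examples, here is a minimal sketch of the function's basic behavior. Note that clean_html was deprecated and then removed in NLTK 3.x (where calling it raises NotImplementedError and the suggested replacement is BeautifulSoup's get_text()); the snippet assumes an older NLTK 2.x release where the function still exists, and the sample HTML string is only illustrative.

# Minimal sketch, assuming NLTK 2.x where nltk.clean_html() is still available.
# On NLTK 3.x this call raises NotImplementedError; see the commented alternative below.
import nltk

html = "<html><body><h1>Title</h1><p>First paragraph.</p></body></html>"
text = nltk.clean_html(html)   # strips the markup and returns plain text
print(text)                    # e.g. "Title First paragraph."

# Rough equivalent for NLTK 3.x / current code bases (assumes bs4 is installed):
# from bs4 import BeautifulSoup
# text = BeautifulSoup(html, "html.parser").get_text()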
Example 1: parse
def parse(self, fname):
    try:
        with open(fname, "r") as f:
            log.info("Process %s" % fname)
            soup = BeautifulSoup(f.read())
            tbl = soup.find("table", {"class": "cable"})
            docid = tbl.findAll('tr')[1].\
                    findAll('td')[0].contents[1].contents[0]
            if docid in self.docids:
                return True
            doc = {
                "_id": docid,
                "refererence_id": docid,
                "date_time": tbl.findAll('tr')[1].\
                        findAll('td')[1].contents[1].contents[0],
                "classification": tbl.findAll('tr')[1].\
                        findAll('td')[2].contents[1].contents[0],
                "origin": tbl.findAll('tr')[1].\
                        findAll('td')[3].contents[1].contents[0],
                "header": nltk.clean_html(str(soup.findAll(['pre'])[0])),
                "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
            }
            return doc
    except OSError:
        log.error("Can't open '%s'" % fname)
        self.processed -= 1
Author: benoitc, Project: cablesgate, Lines: 30, Source: cablesgate.py
Example 2: extract_content
def extract_content(self, raw):
    logging.info('Processor.extract_content')
    soup = BeautifulSoup(raw)
    cable_table = soup.find("table", {"class": "cable"})
    cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
        .contents[1].contents[0]
    if db.cables.find_one({'_id': cable_id}):
        self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
        logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
        self.print_counts()
        return
    cable = Cable(raw)
    cable['_id'] = cable_id
    cable['reference_id'] = cable_id
    cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
        .contents[1].contents[0]
    cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
        .contents[1].contents[0]
    cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
        .contents[1].contents[0]
    cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
    cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
    db.cables.insert(cable.get())
    self.counts['files_processed'] = self.counts['files_processed'] + 1
    self.print_counts()
    if (self.counts['files_processed'] + self.counts['files_not_processed'])\
            == self.counts['files_to_process']:
        self.dump_json()
Author: anarchivist, Project: cablegate, Lines: 34, Source: process.py
Example 3: scrape_links_and_wordlistify
def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist
Author: tkisason, Project: unhash, Lines: 25, Source: gwordlist.py
Example 4: parse_file
def parse_file(self, filepath):
    """
    Parses a corpus file and initializes the object.

    @param filepath: The path of the corpus file to parse.
    @type  filepath: C{string}
    """
    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>", 1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])
    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    content = ""
    for p in raw_content.split("\n"):
        p = p.strip()
        if p != "":
            if content != "":
                content += " "
            content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()
    self.set_content(content)
    html_file.close()
Author: 52nlp, Project: KeyBench, Lines: 28, Source: wikinews2012.py
Example 5: scrapeBlog
def scrapeBlog(url, depth):  # obs hackkkkkkkkk
    allText = ""
    pages = getPages(url)
    pages = pages[(depth+1):]  # take the rest
    posts = []
    timestamps = []
    for url in pages:
        response = getContent(url)
        repls = ('januari', 'january'), ('februari', 'february'), ('mars', 'march'), ('maj', 'may'), ('juni', 'june'), ('juli', 'july'), ('augusti', 'august'), ('oktober', 'october')
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        soup = BeautifulSoup(response)
        try:
            poststext = soup.select(".blogposttext")  # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        #allText = allText + "\n\n" + getAllText(url)
    return posts, timestamps
Author: maxberggren, Project: sinus, Lines: 30, Source: nattstad_post19.py
Example 6: process_feed
def process_feed(self, entries):
    abbr = self.abbr
    feed_entries = db.feed_entries
    third = itemgetter(2)
    # Find matching entities in the feed.
    for entry, matches in self.scan_feed(entries):
        matches = self.extract_entities(matches)
        ids = map(third, matches)
        strings = [m.group() for m, _, _ in matches]
        assert len(ids) == len(strings)
        # Add references and save in mongo.
        entry['state'] = abbr  # list probably wiser
        entry['entity_ids'] = ids or None
        entry['entity_strings'] = strings or None
        entry['save_time'] = datetime.datetime.utcnow()
        entry['_id'] = new_feed_id(entry)
        entry['_type'] = 'feedentry'
        entry['summary'] = nltk.clean_html(entry['summary'])
        try:
            entry['summary_detail']['value'] = nltk.clean_html(
                entry['summary_detail']['value'])
        except KeyError:
            pass
        feed_entries.save(entry)
        msg = 'Found %d related entities in %r'
        self.logger.info(msg % (len(ids), entry['title']))
Author: kevinthew, Project: openstates, Lines: 32, Source: scrape.py
Example 7: getKeyList
def getKeyList(testID):
    myDataQ = getData(testID, 1)
    myDataA = getData(testID, 0)
    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)
    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packQ += p
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ, aQ, vQ, cQ = cparFuncs(t1)
    packQ, funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))
    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem + "." + fItem)
    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packA += p
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA, aA, vA, cA = cparFuncs(t2)
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))
    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem + "." + fItem)
    keyList = \
        list(set(fQ+fA+aQ+aA+vQ+vA+cQ+cA+combQ+combA+packQ+packA+funcQ+funcA+userKeyQ+userKeyA))
    return keyList
Author: paulyang1990, Project: FYT-stackoverflow.com-Summarization, Lines: 60, Source: ana.py
Example 8: getarticle
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div', {'class': None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title, text
Author: thequbit, Project: newsvis, Lines: 8, Source: rhp_scraper.py
Example 9: getarticle
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title, text
Author: thequbit, Project: newsvis, Lines: 8, Source: whec_scraper.py
Example 10: preprocess_hotel_review
def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocessing and truthfulness of the hotel review.

    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])
    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))
    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1
    import csv
    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Author: hs634, Project: cs4740, Lines: 57, Source: smoothing-ngram.py
Example 11: extrait
def extrait(self, rss):
    d = feedparser.parse(rss)
    h = random.randint(0, len(d['entries']) - 1)
    print h
    print str(len(d['entries']))
    titre = nltk.clean_html(d['items'][h].title)
    descriptionb = nltk.clean_html(d['items'][h].description)
    description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
    return titre + ". \n\n" + description
Author: appnt, Project: SiriServer, Lines: 9, Source: lecteurFluxRssFrance.py
Example 12: __init__
def __init__(self, directory):
    # get list of all tags that can be simplified into synonym tags
    stf = open(directory + "tags_synonym.csv", 'r')  # converting each tag to its hypernym
    rdr = csv.reader(stf)
    for r in rdr:
        # r[0]=tag  r[1]=tag it should be replaced with
        self.synonym_tags[r[0]] = r[1]
    stf.close()
    tf = open(directory + "tags.csv", 'r')  # assign a weight for each tag
    rdr = csv.reader(tf)
    for r in rdr:
        tmp = r[0].split(';')  # tmp[0]=tag  tmp[1]=frequency
        self.tags[tmp[0]] = float(1 / float(tmp[1]))
    tf.close()
    for tmp in self.tags:
        t = tmp.split('-')
        if len(t) > 1:
            t2 = tmp.replace('-', ' ')
            #print t2
            if t[0] not in self.complex_tags:
                self.complex_tags[t[0]] = []
            self.complex_tags[t[0]].append(t2)
            #self.complex_tags_replacements[t[0]]=tmp
            self.complex_tags_replacements[t2] = tmp
    qf = open(directory + "Questions&Answers&Tags.csv", 'r')
    rdr = csv.reader(qf)
    for r in rdr:  # r[0]: question title  r[1]: question body  r[2]: best answer  r[3]: tags
        if r[0][len(r[0]) - 1] not in ['!', '?', '.']:
            r[0] = r[0] + '.'
        r[1] = nltk.clean_html(r[1])
        r[2] = nltk.clean_html(r[2])
        r[0] = r[0] + ' ' + r[1]
        self.questions.append(r[0])
        self.answers.append(r[1])
        n = len(self.questions) - 1
        r[3] = r[3].replace('<', '')
        r[3] = r[3].replace('>', ' ')
        tmplist = r[3].split(' ')
        for t in tmplist:
            if t in self.synonym_tags:
                r[3] = r[3].replace(t, self.synonym_tags[t])
        tmplist = r[3].split(' ')
        tmplist.pop()
        self.tagsInQuestions[n] = tmplist
        for t in tmplist:
            if t not in self.questionsForTags:
                self.questionsForTags[t] = []
            self.questionsForTags[t].append(n)
    qf.close()
Author: bijilap, Project: Doctor-Tux, Lines: 55, Source: DoctorTux.py
Example 13: index
def index():
    steps = Step.query.order_by(Step.num_de_paso)
    for step in steps:
        if step.tipo_de_tramite:
            step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
        if step.requisitos:
            step.requisitos = clean_html(step.requisitos)
        if step.consideraciones:
            step.consideraciones = clean_html(step.consideraciones)
        if step.preguntas_frecuentes:
            step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
    return render_template('index.html', steps=steps)
Author: CoquiCoders, Project: negocio123, Lines: 12, Source: negocio123.py
Example 14: autos_us
def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens
Author: cinterloper, Project: rap-analysis, Lines: 13, Source: counting_cars.py
Example 15: gasPrices
def gasPrices(origin, destination):
    one_way_cost = ''
    from_address = origin
    to_address = destination
    new_from_address = from_address.replace(" ", "+")
    new_to_address = to_address.replace(" ", "+")
    url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
    html = urllib.urlopen(url)
    for line in html:
        if "costofdriving" and "$" in line:
            one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
            round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
            break
    return one_way_cost
Author: agadiraju, Project: r-3, Lines: 14, Source: views.py
Example 16: invent_ext
def invent_ext(htmlString):
    start = htmlString.find("Inventors:")
    end = htmlString.find("Assignee:")
    end2 = htmlString.find("Appl. No.:")
    if start == -1:
        extract = "No Inventors Listed"
    else:
        if end == -1:
            extract = htmlString[start+11:end2]
            extract = nltk.clean_html(extract)
        else:
            extract = htmlString[start+11:end]
            extract = nltk.clean_html(extract)
    return extract
Author: apgoldst, Project: cit-tools, Lines: 15, Source: uspto_parser.py
Example 17: webUrl
def webUrl(fullUrl):
    # urllib2 works best with a specific url format
    validUrl = re.compile(
        r'^(?:http)s?://|'  # http:// or https://
        r'^(?:http)s?://www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    if validUrl.match(fullUrl):
        finalList = []
        urlInput = quote(fullUrl, safe="%/:=&?~#+!$,;'@()*[]")
        urlInput = urlInput.strip('%0A')
        try:
            u = urlopen(urlInput)
            html = u.read()
            raw = nltk.clean_html(html)
            tokens = nltk.word_tokenize(raw)
            if args.minLength or args.maxLength:
                for token in tokens:
                    if not (len(token.translate(None, charBlacklist)) < minl or len(token) > maxl):
                        wordList.append(str(token).translate(None, charBlacklist))
            else:
                for token in tokens:
                    wordList.append(str(token).translate(None, charBlacklist))
            print "Scraping URL - {0}".format(fullUrl)
        except Exception as e:
            print 'There was an error connecting to or parsing {0}'.format(fullUrl)
            print 'Error: %s' % e
    else:
        print 'INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl)
Author: stevenswafford, Project: script-farm, Lines: 33, Source: SmeegeScrape.py
Example 18: retrieve_editorial
def retrieve_editorial(self, a_url):
    editorial = []
    # Open URL object
    print a_url, " < url"
    try:
        contents = self.url_read(a_url)
        para_ct = 0
        for para in re.finditer(r'<p>(.*?)</p>', contents, re.DOTALL):
            try:
                para = para.groups()[0]
                if dbg: print "para ", len(para)
                para_ct += len(para)
                cleaned = nltk.clean_html(para)
                self.toks = cleaned.split()
                # self.toks = nltk.word_tokenize(cleaned)
                self.toks = [it.lower() for it in self.toks]
                self.remove_punctuation()
                if dbg: print(self.toks)
                editorial.extend(self.toks)
            except Exception, e:
                print para
                print e
        print para_ct, 'symbols'
Author: milliondreams, Project: think-link, Lines: 30, Source: scrape_news.py
Example 19: Create_index_from_url
def Create_index_from_url(url, depth):
    if depth > MAX_DEPTH:
        return []
    url_queue = Queue()
    url_queue.put(url)
    checked = []
    IndexGen = Index_Generator()
    while not url_queue.empty():
        current_url = url_queue.get()
        checked.append(current_url)
        try:
            html = Get_page(current_url)
        except:
            print "Exception"
            continue
        if depth > 0:
            for link in Link_generator(html):
                #print link
                if link not in checked:
                    url_queue.put(link)
            depth = depth - 1
        html = nltk.clean_html(html)
        IndexGen.gen_url_index(current_url, html)
    result_index = {}
    result_index = IndexGen.get_index_dict()
    for key in result_index:
        result_index[key].sort()
    return result_index
Author: antonfait, Project: SerchEngine, Lines: 34, Source: search_engine.py
Example 20: get_xmen_text
def get_xmen_text(soup):
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    #nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    #for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(poss_filter, binary=True) for pos in poss_filter]

    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees(lambda s: s.node == node)]

    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people
              for subsublist in sublist
              for subsubsublist in subsublist
              for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person != 'Magneto')
    fd.plot(50)
Author: bdewilde, Project: nltk_sandbox, Lines: 26, Source: scrape_xmen.py
Note: The nltk.clean_html examples in this article were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any redistribution or use should follow the corresponding project's license. Do not reproduce without permission.