Python nltk.clean_html Function Code Examples


This article collects typical usage examples of the nltk.clean_html function in Python. If you are wondering what clean_html does, how to call it, or what real-world uses look like, the curated examples here may help.



Twenty code examples of the clean_html function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
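Before the examples, here is a minimal, self-contained sketch of the call pattern they all share (the HTML string is purely illustrative). Note that nltk.clean_html only exists in NLTK 2.x; in NLTK 3.0 and later the function was removed and calling it raises NotImplementedError, with BeautifulSoup's get_text() being the usual replacement.

import nltk

html = "<p>An <b>illustrative</b> snippet of HTML.</p>"

try:
    # NLTK 2.x: clean_html() strips the markup and returns plain text.
    text = nltk.clean_html(html)
except (NotImplementedError, AttributeError):
    # NLTK 3.0+ removed clean_html; BeautifulSoup is the usual substitute.
    from bs4 import BeautifulSoup
    text = BeautifulSoup(html, "html.parser").get_text()

print(text)

The twenty examples that follow are reproduced as found in their source projects; most of them appear to target Python 2 and NLTK 2.x.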

Example 1: parse

    def parse(self, fname):
        try:
            with open(fname, "r") as f:
                log.info("Process %s" % fname)
                soup = BeautifulSoup(f.read())
                tbl = soup.find("table", { "class" : "cable" })
                docid = tbl.findAll('tr')[1].\
                        findAll('td')[0].contents[1].contents[0]

                if docid in self.docids:
                    return True

                doc = {
                        "_id": docid,
                        "refererence_id": docid,
                        "date_time": tbl.findAll('tr')[1].\
                                findAll('td')[1].contents[1].contents[0],
                        "classification": tbl.findAll('tr')[1].\
                                findAll('td')[2].contents[1].contents[0],
                        "origin": tbl.findAll('tr')[1].\
                                findAll('td')[3].contents[1].contents[0],
                        "header":nltk.clean_html(str(soup.findAll(['pre'])[0])),
                        "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
                }
                
                return doc

        except OSError:
            log.error("Can't open '%s'" % fname)
            self.processed -= 1
Developer: benoitc, Project: cablesgate, Lines: 30, Source: cablesgate.py


Example 2: extract_content

 def extract_content(self,raw):
   logging.info('Processor.extract_content')
   
   soup = BeautifulSoup(raw)
   cable_table = soup.find("table", { "class" : "cable" })
   cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
     .contents[1].contents[0]
   if db.cables.find_one({'_id':cable_id}):
     self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
     logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
     self.print_counts()
     return
     
   cable = Cable(raw)
   cable['_id'] = cable_id
   cable['reference_id'] = cable_id
   cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
     .contents[1].contents[0]
   cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
     .contents[1].contents[0]
   cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
     .contents[1].contents[0]
   cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
   cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
   
   db.cables.insert(cable.get())
   
   self.counts['files_processed'] = self.counts['files_processed'] + 1
   
   self.print_counts()
   
   if (self.counts['files_processed'] + self.counts['files_not_processed'])\
     == self.counts['files_to_process']:
     self.dump_json()
Developer: anarchivist, Project: cablegate, Lines: 34, Source: process.py


Example 3: scrape_links_and_wordlistify

def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist
Developer: tkisason, Project: unhash, Lines: 25, Source: gwordlist.py


Example 4: parse_file

  def parse_file(self, filepath):
    """
    Parses a corpus file and initialize the object.
    
    @param  filepath: The path of the corpus file to parse.
    @type   filepath: C{string}
    """

    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>",1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])

    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    
    content = ""
    for p in raw_content.split("\n"):
      p = p.strip()

      if p != "":
        if content != "":
          content += " "
        content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()

    self.set_content(content)

    html_file.close()
Developer: 52nlp, Project: KeyBench, Lines: 28, Source: wikinews2012.py


Example 5: scrapeBlog

def scrapeBlog(url, depth): # obs hackkkkkkkkk
    allText = ""
    pages = getPages(url)
    pages = pages[(depth+1):] # take the rest
    posts = []
    timestamps = []
    
    for url in pages:
        response = getContent(url)
        repls = ('januari', 'january'), ('februari', 'february'), ('mars', 'march'), ('maj', 'may'), ('juni', 'june'), ('juli', 'july'), ('augusti', 'august'), ('oktober', 'october')
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        
        soup = BeautifulSoup(response)
        
        
        try:
            poststext = soup.select(".blogposttext") # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        #allText = allText + "\n\n" + getAllText(url)
    
    return posts, timestamps
Developer: maxberggren, Project: sinus, Lines: 30, Source: nattstad_post19.py


Example 6: process_feed

    def process_feed(self, entries):
        abbr = self.abbr
        feed_entries = db.feed_entries
        third = itemgetter(2)

        # Find matching entities in the feed.
        for entry, matches in self.scan_feed(entries):                    
            matches = self.extract_entities(matches)

            ids = map(third, matches)
            strings = [m.group() for m, _, _ in matches]
            assert len(ids) == len(strings)

            # Add references and save in mongo.
            
            entry['state'] = abbr # list probably wiser
            entry['entity_ids'] = ids or None
            entry['entity_strings'] = strings or None
            entry['save_time'] = datetime.datetime.utcnow()
            entry['_id'] = new_feed_id(entry)
            entry['_type'] = 'feedentry'

            entry['summary'] = nltk.clean_html(entry['summary'])
            try:
                entry['summary_detail']['value'] = nltk.clean_html(
                    entry['summary_detail']['value'])
            except KeyError:
                pass
            
            feed_entries.save(entry)
            msg = 'Found %d related entities in %r'
            self.logger.info(msg % (len(ids), entry['title']))
Developer: kevinthew, Project: openstates, Lines: 32, Source: scrape.py


Example 7: getKeyList

def getKeyList(testID):
    myDataQ = getData(testID,1)
    myDataA = getData(testID,0)

    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)

    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packQ += p 
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ,aQ,vQ,cQ = cparFuncs(t1) 
    packQ,funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))

    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem+"."+fItem) 

    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packA += p 
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA,aA,vA,cA = cparFuncs(t2) 
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))

    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem+"."+fItem) 

    keyList = \
    list(set(fQ+fA+aQ+aA+vQ+vA+cQ+cA+combQ+combA+packQ+packA+funcQ+funcA+userKeyQ+userKeyA))

    return keyList
Developer: paulyang1990, Project: FYT-stackoverflow.com-Summarization, Lines: 60, Source: ana.py


Example 8: getarticle

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div',{'class':None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title,text
Developer: thequbit, Project: newsvis, Lines: 8, Source: rhp_scraper.py


Example 9: getarticle

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title,text
Developer: thequbit, Project: newsvis, Lines: 8, Source: whec_scraper.py


Example 10: preprocess_hotel_review

def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocess and truthfulness of the hotel review
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"

        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])

    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))

    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1

    import csv

    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Developer: hs634, Project: cs4740, Lines: 57, Source: smoothing-ngram.py


Example 11: extrait

    def extrait(self, rss):
	d = feedparser.parse(rss)
	h = random.randint(0, len(d['entries']) -1)
	print h
	print str(len(d['entries']))
	titre = nltk.clean_html(d['items'][h].title)
	descriptionb = nltk.clean_html(d['items'][h].description)
	description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
	return titre+". \n\n"+description
Developer: appnt, Project: SiriServer, Lines: 9, Source: lecteurFluxRssFrance.py


Example 12: __init__

	def __init__(self,directory):
		#get list of all tags that can be simplified into synonym tags
		stf = open(directory+"tags_synonym.csv", 'r') #converting each tag to its hypernym
		rdr= csv.reader(stf)
		for r in rdr:  
			#r[0]=tag  r[1]=tag it should be replaced with
			self.synonym_tags[r[0]]=r[1]
		stf.close()

		tf=open(directory+"tags.csv", 'r') #assign wieght for tag for each tag
		rdr=csv.reader(tf)
		for r in rdr:
			tmp=r[0].split(';') #tmp[0]=tag      tmp[1]=frequency
			self.tags[tmp[0]]=float(1/float(tmp[1]))
		tf.close()

		for tmp in self.tags:
			t=tmp.split('-')
			if len(t)>1:
				t2=tmp.replace('-',' ')
				#print t2
				if t[0] not in self.complex_tags:
					self.complex_tags[t[0]]=[]

				self.complex_tags[t[0]].append(t2)
				#self.complex_tags_replacements[t[0]]=tmp
				self.complex_tags_replacements[t2]=tmp

		qf=open(directory+"Questions&Answers&Tags.csv",'r')
		rdr=csv.reader(qf)
		for r in rdr: #r[0]:question title r[1]=question title r[2]: best answer r[3]: tags
			if r[0][len(r[0])-1] not in ['!','?','.']:
				r[0]=r[0]+'.'
			r[1]=nltk.clean_html(r[1])
			r[2]=nltk.clean_html(r[2])
			r[0]=r[0]+' '+r[1]
			self.questions.append(r[0])
			self.answers.append(r[1])
			n=len(self.questions)-1
			r[3]=r[3].replace('<','')
			r[3]=r[3].replace('>',' ')
			tmplist=r[3].split(' ')
			for t in tmplist:
				if t in self.synonym_tags:
					r[3]=r[3].replace(t,self.synonym_tags[t])

			tmplist=r[3].split(' ')
			tmplist.pop()
			self.tagsInQuestions[n]=tmplist
			for t in tmplist:
				if t not in self.questionsForTags:
					self.questionsForTags[t]=[]
				self.questionsForTags[t].append(n)

		qf.close()
Developer: bijilap, Project: Doctor-Tux, Lines: 55, Source: DoctorTux.py


Example 13: index

def index():
  steps = Step.query.order_by(Step.num_de_paso)
  for step in steps:
    if step.tipo_de_tramite:
      step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
    if step.requisitos:
      step.requisitos = clean_html(step.requisitos)
    if step.consideraciones:
      step.consideraciones = clean_html(step.consideraciones)
    if step.preguntas_frecuentes:
      step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
  return render_template('index.html', steps=steps)
Developer: CoquiCoders, Project: negocio123, Lines: 12, Source: negocio123.py


Example 14: autos_us

def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens
Developer: cinterloper, Project: rap-analysis, Lines: 13, Source: counting_cars.py


Example 15: gasPrices

def gasPrices(origin, destination):
	one_way_cost = ''
	from_address = origin
	to_address = destination
	new_from_address = from_address.replace(" ", "+")
	new_to_address = to_address.replace(" ", "+")
	url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
	html = urllib.urlopen(url)
	for line in html:
		if "costofdriving" and "$" in line:
			one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
			round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
			break
	return one_way_cost
Developer: agadiraju, Project: r-3, Lines: 14, Source: views.py


Example 16: invent_ext

def invent_ext(htmlString):
    start = htmlString.find("Inventors:")
    end = htmlString.find("Assignee:")
    end2 = htmlString.find("Appl. No.:")
    if start == -1:
        extract = "No Inventors Listed"
    else:
        if end == -1:
            extract = htmlString[start+11:end2]
            extract = nltk.clean_html(extract)
        else:
            extract = htmlString[start+11:end]
            extract = nltk.clean_html(extract)
    
    return extract
Developer: apgoldst, Project: cit-tools, Lines: 15, Source: uspto_parser.py


Example 17: webUrl

def webUrl(fullUrl):
    #urllib2 works best with a specific url format
    validUrl = re.compile(
        r'^(?:http)s?://|' # http:// or https://
        r'^(?:http)s?://www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    if validUrl.match(fullUrl):
        finalList = []
        urlInput = quote(fullUrl, safe="%/:=&?~#+!$,;'@()*[]")
        urlInput = urlInput.strip('%0A')
        try:
            u = urlopen(urlInput)
            html = u.read()
            raw = nltk.clean_html(html)
            tokens = nltk.word_tokenize(raw)
            if args.minLength or args.maxLength:
                for token in tokens:
                    if not(len(token.translate(None,charBlacklist)) < minl or len(token) > maxl):
                        wordList.append(str(token).translate(None,charBlacklist))
            else:
                for token in tokens:
                    wordList.append(str(token).translate(None,charBlacklist))
            print "Scraping URL - {0}".format(fullUrl)
        except Exception as e:
            print 'There was an error connecting to or parsing {0}'.format(fullUrl)
            print 'Error: %s' % e
    else:
        print 'INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl)
Developer: stevenswafford, Project: script-farm, Lines: 33, Source: SmeegeScrape.py


Example 18: retrieve_editorial

    def retrieve_editorial(self, a_url):

        editorial =[]
        # Open URL object
        print a_url, " < url"

        try:

            contents = self.url_read(a_url)

            para_ct = 0
            for para in re.finditer(r'<p>(.*?)</p>', contents, re.DOTALL):
                try:
                    para = para.groups()[0]
                    if dbg: print "para ", len(para)
                    para_ct += len(para)
                    cleaned = nltk.clean_html(para)
                    self.toks = cleaned.split()
                    # self.toks  = nltk.word_tokenize(cleaned)
                    self.toks = [it.lower() for it in self.toks]
                    self.remove_punctuation()
                    if dbg: print(self.toks)
                    editorial.extend(self.toks)
                except Exception, e:
                    print para
                    print e



            print para_ct, 'symbols'
Developer: milliondreams, Project: think-link, Lines: 30, Source: scrape_news.py


Example 19: Create_index_from_url

def Create_index_from_url( url, depth ):
    if depth > MAX_DEPTH:
        return []
    url_queue = Queue()
    url_queue.put( url )
    checked = []

    IndexGen = Index_Generator()
    while not url_queue.empty() :

        current_url = url_queue.get()

        checked.append( current_url )

        try:
            html = Get_page( current_url )
        except:
            print "Exception"
            continue
        if depth > 0:
            for link in Link_generator( html ):
                #print link
                if link not in checked:
                    url_queue.put( link )
            depth = depth - 1

        html = nltk.clean_html( html )
        IndexGen.gen_url_index( current_url, html )
        result_index = {}
        result_index = IndexGen.get_index_dict()
        for key in result_index:
            result_index[key].sort()

    return result_index
Developer: antonfait, Project: SerchEngine, Lines: 34, Source: search_engine.py


Example 20: get_xmen_text

def get_xmen_text(soup):
    
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    #nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    #for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(poss_filter, binary=True) for pos in poss_filter]
    
    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees (lambda s: s.node == node)]
    
    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people
              for subsublist in sublist
              for subsubsublist in subsublist
              for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person!='Magneto')
    fd.plot(50)
Developer: bdewilde, Project: nltk_sandbox, Lines: 26, Source: scrape_xmen.py



Note: The nltk.clean_html examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult each project's license before using or redistributing the code; do not republish without permission.

