
Python plaintext.PlaintextParser Class Code Examples


This article collects typical usage examples of the Python class sumy.parsers.plaintext.PlaintextParser. If you have been wondering what PlaintextParser is for, how to use it, or where to find concrete examples, the curated class examples below should help.



Below are 20 code examples of the PlaintextParser class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code examples.
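Before the collected examples, here is a minimal, self-contained sketch of the two PlaintextParser entry points nearly every snippet below relies on: from_string for in-memory text and from_file for a file on disk. The sample text and the commented-out file path are placeholders for illustration, not code from any of the projects credited below (the Tokenizer needs NLTK's punkt data installed).

# Minimal sketch of the PlaintextParser entry points used throughout the examples.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

text = (
    "Sumy is a library for automatic text summarization. "
    "It ships several extractive algorithms. "
    "PlaintextParser turns raw text into the document model the summarizers consume."
)

# Build a parser from an in-memory string ...
parser = PlaintextParser.from_string(text, Tokenizer("english"))
# ... or from a file on disk (placeholder path):
# parser = PlaintextParser.from_file("article.txt", Tokenizer("english"))

# Any summarizer accepts the parsed document plus a sentence count.
summarizer = LexRankSummarizer()
for sentence in summarizer(parser.document, 2):  # keep 2 sentences
    print(sentence)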

Example 1: summarize

def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file-like): the text itself, or an open file object containing it
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):  # file-like object; the bare `file` builtin existed only in Python 2
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or a file-like object')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
Author ID: mtpain | Project: iatv | Lines: 26 | Source: iatv.py


Example 2: test_split_into_words

 def test_split_into_words(self):
     sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", 
         Tokenizer("english")).document.sentences
     self.assertEqual(["One", "two", "two", "Two", "Three"], 
         _split_into_words(sentences1))
     
     sentences2 = PlaintextParser.from_string("two two. Two. Three.", 
         Tokenizer("english")).document.sentences
     self.assertEqual(["two", "two", "Two", "Three"], 
         _split_into_words(sentences2))
Author ID: JyothsnaKS | Project: sumy | Lines: 10 | Source: test_evaluation.py


Example 3: _firstK_score

def _firstK_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))

    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences

    # print geneSen
    # print "=========="
    # print refSen
    # print evaluate(geneSen, refSen)
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)  # print() calls: the original Python 2 print statements fail on Python 3
        print(e)
        raise
Author ID: KevinWangTHU | Project: data_stat | Lines: 16 | Source: summary.py


Example 4: test_get_word_ngrams

 def test_get_word_ngrams(self):
     sentences = PlaintextParser.from_string("This is a test.", 
         Tokenizer("english")).document.sentences
     correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
     found_ngrams = _get_word_ngrams(2, sentences)
     for ngram in correct_ngrams:
         self.assertTrue(ngram in found_ngrams)      
Author ID: JyothsnaKS | Project: sumy | Lines: 7 | Source: test_evaluation.py


Example 5: summarize

def summarize(corpus, length, algorithm):
    summarizer = None
    summary = "No compatible summarizer was selected, please use one of these: textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random (* doesn't work yet)"
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary

    except Exception as e:
        return str(e)
Author ID: ferryxo | Project: IUSE | Lines: 31 | Source: AutoSummary.py


Example 6: kl_rank_sum

def kl_rank_sum(path, K):
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()  # note: LexRank, despite the "kl" in the function name
    summary = summarizer(parser.document, K)  # K = number of sentences in the summary
    return summary
Author ID: danskey | Project: matters_of_concern | Lines: 7 | Source: sumText.py


Example 7: get_summary

def get_summary(source_text, compression_factor):
    """
    Given some input source_text, returns its summary based on the chosen 
    compression factor.
    """
    summary = {
        'source_text': source_text,
        'compression_factor': compression_factor,
        'summary': '',
        'success': False
    }
    
    parser = PlaintextParser.from_string(source_text, Tokenizer("english"))
    summ_algo = LexRankSummarizer()
    final_line_num = int(source_text.count('.') / compression_factor)
    try:
        raw_summary = summ_algo(parser.document, final_line_num)
        for sentence in raw_summary:
            summary['summary'] += str(sentence) + ' '
    except Exception:  # narrowed from a bare except so KeyboardInterrupt still propagates
        pass

    summary['success'] = (len(summary['summary']) != 0)

    return summary
Author ID: luigidintrono | Project: splitsummarizer-web | Lines: 26 | Source: summarize.py


Example 8: summarize_with_info

    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")  # NotImplemented is a constant, not an exception

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
Author ID: ferryxo | Project: AutoSummaryV1 | Lines: 27 | Source: AutoSummary.py


Example 9: summarizeFile

def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url[-1] == '.':
			url = url[0:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Author ID: kansal | Project: Sub-Event-Detection | Lines: 31 | Source: content_url.py


Example 10: summarize

    def summarize(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")  # NotImplemented is a constant, not an exception

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary
Author ID: peerlogic | Project: AutoSummaryV1 | Lines: 26 | Source: AutoSummary.py


Example 11: summarize

def summarize(string, summary_length = 1, language = "english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)]) 
Author ID: jwmueller | Project: SemanticTextDB | Lines: 8 | Source: summarizer.py


Example 12: sumrise

def sumrise(text, sentences=5):
    if validators.url(text):
        text = web2text.getwebtxt(text)

    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = LsaSummarizer()

    # join the selected sentences; str() on the raw result would yield a tuple repr
    summary = ' '.join(str(s) for s in summarizer(parser.document, sentences))
    return summary
Author ID: aniruddhasanyal | Project: Texty | Lines: 8 | Source: sumrise.py


Example 13: summarize

    def summarize(self, extracted_refs, facet_results, max_length=250, mode='citance'):
        '''
        Summarizes the extracted references based on community detection

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
            mode(str) -- can be citance, reference 

        '''
        citances = defaultdict(list)
        summarizer = LexRankSummarizer(Stemmer('english'))
        summary = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            citances[t[0]['topic']].append(
                {'refs': t[0]['sentence'],
                 'citance': self.clean_citation(t[0]['citation_text'])})

        for topic, citance in citances.items():  # iteritems() was removed in Python 3
            # Create graph of citation similarities
            vectorizer = TfidfVectorizer(
                tokenizer=self.tokenize, min_df=1, max_df=len(citances) * .9)
            cit_vectors = vectorizer.fit_transform(
                [e['citance'] for e in citance]).toarray()
            cit_text = {
                i: v for i, v in enumerate(citance)}
            cit_dict = {i: v for i, v in enumerate(cit_vectors)}
            cits = []
            for e in cit_dict:  # vector (numpy array)
                for e1 in cit_dict:
                    if e != e1:
                        simil = self.cossim(cit_dict[e],
                                            cit_dict[e1])
                        if simil > 0.1:
                            cits.append((e, e1, simil))
            G = nx.Graph()
            G.add_weighted_edges_from(cits)
            part = community.best_partition(G)
            clusters = defaultdict(list)
            tokenize = SentTokenizer(offsets=False)
            for k, v in part.items():  # iteritems() was removed in Python 3
                clusters[v].extend(tokenize(citance[k]['refs']))
            # clusters includes ref sentences that belong in each cluster
            # Find the most salient sentence in each cluster
            sal_in_cluster = {}  # salient sentences for each cluster
            for i in clusters:
                parser = PlaintextParser.from_string(
                    ' '.join(clusters[i]).replace('\\', ''), Tokenizer('english'))
                summ = summarizer(parser.document, 5)
                # 5 is the number of sentences returned by LexRank
                sal_in_cluster[i] = [str(s) for s in summ]  # unicode() was Python 2
                # The most salient sentences in each cluster
            summary[topic.upper()] =\
                self.pick_from_cluster(
                    sal_in_cluster, max_length, weighted=False)
        return summary
Author ID: acohan | Project: scientific-summ | Lines: 58 | Source: comm_summarizer.py


Example 14: summarize

    def summarize(self, extracted_refs, facet_results, max_length=250):
        '''
        Summarizes the extracted references based on the facet results

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
        '''
        summaries = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            topic = t[0]['topic']
            citance = t[0]['citance_number']
            if isinstance(t[0]['sentence'][0], list):
                logger.warning('Unexpected, should check')
            summaries[topic.upper()]\
                [facet_results[topic.upper()]
                 [str(citance)]['SVM_LABEL']].append([t[0]['citation_text']])

        summarizer = TextRankSummarizer(Stemmer('english'))

        final_summ = defaultdict(lambda: defaultdict(dict))
        ret_summ = defaultdict(list)
        counts = defaultdict(lambda: defaultdict(dict))
        for t in summaries:
            for facet in summaries[t]:
                if len(summaries[t][facet]) > 1:
                    summs = list(
                        itertools.chain.from_iterable(summaries[t][facet]))
                    parser = PlaintextParser.from_string(
                        ' '.join(summs), Tokenizer('english'))
                    summ = summarizer(parser.document, max_length)
                    final_summ[t][facet] = [str(sent) for sent in summ]  # unicode() was Python 2
                    counts[t][facet] = len(final_summ[t][facet])
                else:
                    final_summ[t][facet] = self.s_t(summaries[t][facet][0])
            i = 0
            while self.w_t.count_words(ret_summ[t]) < max_length:
                for fct in final_summ[t]:
                    if i < len(final_summ[t][fct]):
                        ret_summ[t].append(final_summ[t][fct][i])
                i += 1
            while self.w_t.count_words(ret_summ[t]) > max_length:
                ret_summ[t].pop()


#         summ = defaultdict(list)
#         tokzer = WordTokenizer(stem=False)
#         for k in final_summ:
#             i = 0
#             while tokzer.count_words(summ[k]) < max_length:
#                 for f in final_summ[k]:
#                     if len(final_summ[k][f]) > i and\
#                             tokzer.count_words(summ[k]) < max_length:
#                         summ[k].append(final_summ[k][f][i])
        return ret_summ
Author ID: acohan | Project: scientific-summ | Lines: 57 | Source: citance_facet.py


Example 15: summarize

def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence) + ' '  # separator keeps sentences from running together
    return total
Author ID: notexactlyawe | Project: paper-reader | Lines: 9 | Source: summarize.py


Example 16: getSummary

 def getSummary(self, num_sentences):
     lex_rank = LexRankSummarizer()
     text = str(self.bpLargGetText())
     parser = PlaintextParser.from_string(text, Tokenizer('english'))
     summary = lex_rank(parser.document, num_sentences)
     sentences = []
     for sent in summary:
         sentences.append(str(sent))
     return sentences
Author ID: hyperdemon | Project: MatchingMusicWithTheSemanticContentOfTheWeb | Lines: 9 | Source: my_parser.py


Example 17: summary

def summary():
    max_sent = 10
    language = 'english'
    url = request.form['summary']
    tokenizer = Tokenizer(language)
    article = alt_extract(url)
    parser = PlaintextParser.from_string(article, tokenizer)
    summary = summarizer(parser, max_sent, language).decode('utf-8')
    return render_template('summary.html', url=url, summary=summary)
Author ID: jjangsangy | Project: neuralpoet | Lines: 9 | Source: site.py


Example 18: get_summary

	def get_summary(self, text):
		parser = PlaintextParser.from_string(text, Tokenizer("english"))
		summarizer = LexRankSummarizer()
		summary = summarizer(parser.document, 3)  # summarize the document into 3 sentences

		result = ""
		for sentence in summary:
			result += " " + str(sentence)

		return result
Author ID: gitzain | Project: project-x | Lines: 10 | Source: story.py


Example 19: summarize

def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
Author ID: tishmen | Project: keyword_research | Lines: 10 | Source: contents_helpers.py


Example 20: summarizeText

 def summarizeText(self, body, numSentences = 10):
     """Summarizes body of text to numSentences
     """
     parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))  # str(...).decode() was Python 2
     return summary
Author ID: mchenchen | Project: HouseDoctor | Lines: 10 | Source: Summarizer.py



Note: the sumy.parsers.plaintext.PlaintextParser class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects by their respective contributors, and copyright remains with the original authors; consult each project's license before distributing or reusing the code. Please do not republish without permission.

