Python brown.sents Function Code Examples


This article collects typical usage examples of the Python function nltk.corpus.brown.sents. If you are wondering how exactly to call sents, how it behaves, or what real-world usages look like, the hand-picked code examples below should help.



The following shows 20 code examples of the sents function, sorted by popularity by default; reader votes on useful examples feed the ranking.
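
Before turning to the examples, a minimal orientation sketch of what brown.sents itself returns may help (assuming the Brown corpus data has been downloaded, e.g. via nltk.download('brown'); this sketch is written for Python 3, whereas several of the snippets below use Python 2 print syntax):

from nltk.corpus import brown

# every sentence in the corpus, each as a list of word tokens
all_sents = brown.sents()
print(len(all_sents))            # roughly 57,000 sentences in the full corpus
print(all_sents[0])              # ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]

# restrict to particular categories or files
news_sents = brown.sents(categories='news')
file_sents = brown.sents(fileids=['cg22'])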

Example 1: load_sentences_brown

def load_sentences_brown(nb_sentences=None):
    """
    :param nb_sentences: Use if all brown sentences are too many
    :return: index2word (list of string)
    """
    from nltk.corpus import brown
    import gensim

    print 'building vocab ...'

    if nb_sentences is None:
        sents = brown.sents()
    else:
        sents = brown.sents()[:nb_sentences]

    # I use gensim model only for building vocab
    model = gensim.models.Word2Vec()
    model.build_vocab(sents)
    vocab = model.vocab

    # ids: list of (list of word-id)
    ids = [[vocab[w].index for w in sent
            if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32]
           for sent in sents]

    return ids, model.index2word
Developer: perrier1034 | Project: skipgram-word2vec-keras | Lines of code: 26 | Source file: utils.py
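
A hedged usage sketch for the function above (the cap of 5000 sentences is arbitrary; note that the snippet targets an older gensim API, where vocab and index2word sit directly on the model rather than under model.wv):

# illustrative call only
ids, index2word = load_sentences_brown(nb_sentences=5000)
print(len(ids), len(index2word))
print([index2word[i] for i in ids[0]])   # first sentence mapped back to word strings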


Example 2: clean

    def clean():

        '''
        1. Removes any individual special character.
        2. Lowers all the words.
        :return: list of clean sentences
        '''

        sents = list(brown.sents())
        sents_copy = list(brown.sents())
        n = len(sents)
        print 'Removing special chars...'
        for i in range(0, n):
            for word in sents[i]:
                if not bool(re.search('[A-Za-z0-9]', word)):
                    sents_copy[i].remove(word)
        print 'Removed special chars.'
        sents = None

        print 'Lowercasing all the words...'
        for i in range(0, n):
            m = len(sents_copy[i])
            for j in range(0, m):
                sents_copy[i][j] = sents_copy[i][j].lower()
        print 'Lowered all the words.'
        return sents_copy
Developer: CRUZEAAKASH | Project: ArticleWriter | Lines of code: 26 | Source file: BrownDataCleaner.py
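
For comparison, a purely illustrative, more compact way to express the same cleaning with list comprehensions (which also avoids removing items from a list while it is being walked):

import re
from nltk.corpus import brown

def clean_sents():
    # keep only tokens containing at least one alphanumeric character, lowercased
    return [[w.lower() for w in sent if re.search('[A-Za-z0-9]', w)]
            for sent in brown.sents()]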


Example 3: print_brown

def print_brown():
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news','reviews'])
    news_text=brown.words(categories='news')
    fdist=nltk.FreqDist([w.lower() for w in news_text])
    modals=['can','could','may','might','must','will']
    for m in modals:
        print m+':',fdist[m]
Developer: Paul-Lin | Project: misc | Lines of code: 11 | Source file: toturial.py


Example 4: load_movie_corpus_each_sentence

def load_movie_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
Developer: zjusuyong | Project: multi_grain_lda | Lines of code: 7 | Source file: vocabulary_for_mglda.py


Example 5: find_ngrams

	def find_ngrams(self, n):
		""" Input: the 'n' of 'n-grams'

			Find all the n-grams in the brown corpus. Store in frequency dictionary.
			Optionally it can be decided to use more corpora in order to have more data.

			Note: these are of course n-grams based on going through the sentence from left to right
			If we want to give the correction back based on the dependency tree, we need to
			parse the brown corpus (or any other data set) with the dependency parser, so that
			we can use this data. 			

		"""
		
		total_ngram_count = 0
		ngram_freq_dict = {}

		sents = brown.sents()
		for sent in sents:
			sent = ['-START-']*(n-1)+sent
			ngrams_brown = ngrams(sent, n)
			
			for i in ngrams_brown:
				total_ngram_count += 1
				old = ngram_freq_dict.get(i,0)
				old += 1
				ngram_freq_dict[i] = old
				#print i,old

		return ngram_freq_dict, total_ngram_count
Developer: Tomaat | Project: grammarCorrector | Lines of code: 29 | Source file: correction.py
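
A standalone sketch of the same counting idea, using nltk.util.ngrams directly over the Brown sentences (function and variable names here are illustrative, not from the original project):

from collections import Counter
from nltk.corpus import brown
from nltk.util import ngrams

def brown_ngram_counts(n):
    counts = Counter()
    for sent in brown.sents():
        padded = ['-START-'] * (n - 1) + list(sent)
        counts.update(ngrams(padded, n))
    return counts, sum(counts.values())

# e.g. relative frequency of the bigram ('of', 'the')
# counts, total = brown_ngram_counts(2)
# print(counts[('of', 'the')] / total)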


Example 6: data_api

def data_api(spilt_rate):
    raw_sent = brown.sents()
    partial_data = raw_sent[:int(0.1*len(raw_sent))]

    data_x, data_y = prepare_0(partial_data, word2intdict)

    print 'len data_x', len(data_x), len(data_y)

    train_inds = npr.choice(range(len(data_x)), size = int((1 - spilt_rate) * len(data_x)), replace = False)
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    print 'len train_inds', len(train_inds), len(data_x)
    for i in range(len(data_x)):
        if i in train_inds:
        	#print 'trn', i
            X_train.append(data_x[i])
            Y_train.append(data_y[i])
        else :
        	#print 'tst', i
            X_test.append(data_x[i])
            Y_test.append(data_y[i])
    print 'len X_train', len(X_train), len(X_test)
    return (X_train, Y_train), (X_test, Y_test)
Developer: taineleau | Project: Neural-Learner-for-English-Language-Test | Lines of code: 25 | Source file: get_data.py


Example 7: lookupTagger

def lookupTagger():

    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)

    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)

    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
            backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()   
Developer: AkiraKane | Project: Python | Lines of code: 31 | Source file: c05_auto_tagging.py


Example 8: read_datas

 def read_datas(self):
     brown_tagged_sentence  = brown.tagged_sents()
     brown_sent = brown.sents()
     size = int(len(brown_tagged_sentence) * 0.9)
     train_set =  brown_tagged_sentence[:size]
     test_set = brown_tagged_sentence[size:]
     return (train_set,test_set)
Developer: Nicolas99-9 | Project: TERApprentissage | Lines of code: 7 | Source file: tagger.py


Example 9: build_index

def build_index(out_filename, in_filename = None):
    '''Builds data files for word lookup. Can take an optional input file
    to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''

    sents_data = []
    try:
        in_file = open(in_filename).read()
        sents_data += sent_tokenize(in_file)
        in_file.close()
    except:
        print("Warning: Failed to load external file for building.")

    sents_data += brown.sents() + treebank.sents()

    # get sentences, chop of rtheir ambiguous heads, and look at their words!
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words to list of words
    mywords = [word for word in mysents for word in word]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up most frequent form of lowercase word by doing cfd['word'].max()
    # but need to check for existance of word in cfd first

    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()
Developer: lberezy | Project: LangComp | Lines of code: 35 | Source file: main.py
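
A hedged sketch of how the pickled ConditionalFreqDist might be read back and queried, as the in-code comment suggests (the file name and the word looked up are placeholders):

import pickle

with open('case_index.pickle', 'rb') as f:   # whatever out_filename was passed to build_index
    cfd = pickle.load(f)

word = 'london'
if word in cfd:                  # guard: indexing a missing key would create an empty FreqDist
    print(cfd[word].max())       # most frequent surface form, e.g. 'London'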


Example 10: cal_idf

def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print f
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print 'start caling tfidf'

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    pickle.dump(vectorizer, open('idf_vectorizer', 'w'))
    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'w'))
Developer: JayveeHe | Project: OpinionRankProject | Lines of code: 32 | Source file: corpus_utils.py


Example 11: auto_tag

def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged words
    """
    brown_tagged_sents = brown.tagged_sents(categories = 'news', tagset='universal')
    brown_sents = brown.sents(categories = 'news')

    # open the review of a company, and print error message if company review doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/'+ company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')

    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
Developer: vicher37 | Project: jobchart | Lines of code: 28 | Source file: review_summary.py


Example 12: update_category_by_pos

def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print pos_features(brown.sents()[0], 8)

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
Developer: brenden17 | Project: infinity | Lines of code: 30 | Source file: category_nltk.py


Example 13: import_brown_pos

def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                sent = [simplify_tag(t) for t in sent]
            norm = [nltk.tuple2str(t) for t in sent]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), 
                        file=log)
Developer: tdiggelm | Project: nltk-playground | Lines of code: 26 | Source file: train.py


Example 14: createModel

def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet= []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)
        
    
    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()      
Developer: siddharthasandhu | Project: NLPProjects | Lines of code: 59 | Source file: stanLearn.py


Example 15: get_valid_brown_corpus

def get_valid_brown_corpus():
    global DIR
    DIR = BROWN_DIR
    genre = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
    sentences = brown.sents(categories=genre)
    sents = remove_bad_sents(sentences)
    sents = [[w.lower() for w in s] for s in sents]
    return sents
Developer: eugenet12 | Project: PoemGenerator | Lines of code: 8 | Source file: process_corpus.py


Example 16: brown_tagged_sents

def brown_tagged_sents():
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    return (train_sents, brown_tagged_sents[size:])
Developer: atokop | Project: compling | Lines of code: 8 | Source file: u3.py
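
A short follow-up sketch of how the returned split might be used; the example above builds unigram_tagger but never touches the held-out 10%, so this adds a DefaultTagger backoff and an evaluation step that are not in the original:

import nltk

train_sents, test_sents = brown_tagged_sents()
tagger = nltk.UnigramTagger(train_sents, backoff=nltk.DefaultTagger('NN'))
print(tagger.evaluate(test_sents))   # tagging accuracy on the held-out sentences
                                     # (.evaluate is named .accuracy in newer NLTK)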


Example 17: ic

def ic(w) :
	total = 0
	for sentence in b.sents():
		for word in sentence:
			total = total + 1
			brown_freqs[word.lower()] +=1
	
	print w.lower() ,":",brown_freqs[w.lower()], 1.0 - (math.log(brown_freqs[w.lower()]) / math.log(total+1))
Developer: dxd132630 | Project: NeoPythonic | Lines of code: 8 | Source file: NLPProject.py


Example 18: uG

def uG():
    global uniCounter   #counts repeats of uniGrams
    global uniGram      #dictionary of biGrams
    global uniGrams     #counts biGrams
    uniCounter = {}
    uniGram = []
    uniGrams = 0

        
    news = brown.sents(categories='editorial')
        
    for x in range (1, MAX, 1):
        
        sent = news[x]
        sent.append('</s>')    #ending sentences with '</s>'
        sent.insert(0, '<s>')  #beginning sentences with '<s>'
        
        for x in range (0,sent.count('.')+1,1):
            try:
                sent.remove('.')   #removing .'s
            except:
                pass
        for x in range (0,sent.count(',')+1,1):
            try:
                sent.remove(',')   #removing ,'s
            except:
                pass
        for x in range (0,sent.count("'")+1,1):
            try:
                sent.remove("'")   #removing ''s
            except:
                pass
        for x in range (0,sent.count('"')+1,1):
            try:
                sent.remove('"')   #removing ''s
            except:
                pass
        x = 0
        for word in sent:
            word = word.lower()  #making all letters lowercase
            sent[x] = word       #so differences dont occur when
            x = x+1              #they shouldn't

        value = '1'
        for x in range (0,len(sent),1):
            try:
                word = sent[x]
                if(word not in uniGram):
                    uniGram.append(word)
                    uniGrams = uniGrams + 1
                if (word in uniCounter):
                    value = uniCounter[word]
                    value = value + 1
                    uniCounter[word] = value
                else:
                    uniCounter[word] = 1
            except:
                    pass
Developer: cglennk | Project: nGrams | Lines of code: 58 | Source file: nGram.py


Example 19: learn

 def learn(self, listofsentences=[], n=2000):
     self.learned = defaultdict(mydict)
     if listofsentences == []:
         listofsentences = brown.sents()
     for i, sent in enumerate(listofsentences):
         if i >= n:  # Limit to the first nth sentences of the corpus
             break
         for word in sent:
             self.learned[self.specialhash(word)][word.lower()] += 1
Developer: aminorex | Project: icsisumm | Lines of code: 9 | Source file: didyoumean.py


Example 20: collect_data_from_ptb_brow_duc2004

def collect_data_from_ptb_brow_duc2004():

    start_collect = time.time()
    samples = []
    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)

    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # except file .DStore in my macbook
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # except file .DStore in my macbook
                continue
            file_path = folder_path + "/" + cluster_name +"/"+ file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect))
    sys.stdout.flush()
    return samples
Developer: giahy2507 | Project: convae | Lines of code: 57 | Source file: preparedata4convaewmpi.py



Note: the nltk.corpus.brown.sents examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or reuse should follow each project's license. Do not republish without permission.

