
Python util.ngrams Function Code Examples


This article collects typical usage examples of the Python function nltk.util.ngrams. If you have been wondering what exactly ngrams does, how to call it, or what real-world usage looks like, the curated code examples below should help.



Below are 20 code examples of the ngrams function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
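
Before diving into the examples, here is a minimal demonstration of the function itself: ngrams(sequence, n) lazily yields consecutive n-tuples from any sequence (a list of words, or a plain string for character n-grams).

    from nltk.util import ngrams

    tokens = 'the quick brown fox'.split()
    print(list(ngrams(tokens, 2)))
    # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]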

Example 1: modified_precision

    def modified_precision(candidate, references, n):
        # Collect all n-grams of the candidate sentence.
        candidate_ngrams = list(ngrams(candidate, n))
        if len(candidate_ngrams) == 0:
            return 0

        c_words = set(candidate_ngrams)
        clipped_total = 0
        for word in c_words:
            # Add-one smoothed count of this n-gram in the candidate.
            count_w = candidate_ngrams.count(word) + 1

            # Highest (smoothed) count of this n-gram across all references.
            count_max = 0
            for reference in references:
                reference_ngrams = list(ngrams(reference, n))
                count = reference_ngrams.count(word) + 1
                if count > count_max:
                    count_max = count

            # Clip the candidate count by the best reference count and
            # accumulate over all distinct n-grams.
            clipped_total += min(count_w, count_max)

        return clipped_total / float(len(candidate) + len(c_words))
Developer: ab93 | Project: Text-Summarization | Lines: 34 | Source: evalSummary.py
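
A quick, hypothetical sanity check of the function above, assuming it is accessible at module level and that inputs are pre-tokenized lists of words:

    candidate = 'the cat is on the mat'.split()
    references = ['there is a cat on the mat'.split()]
    print(modified_precision(candidate, references, 2))  # smoothed bigram precision score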


Example 2: getTrainData

def getTrainData(corpus, embedsize, ngramsize, m):
	datap = []
	with open(corpus) as f:
		for line in f:
			data = line.strip().split('\t')
			s1 = data[0]
			s2 = data[1]
			label = data[2]
			s1ng = set(ngrams(s1.split(' '), ngramsize))
			s2ng = set(ngrams(s2.split(' '), ngramsize))
			# Keep every n-gram that appears in either sentence.
			all_ngrams = s1ng.union(s2ng)
			for ng in all_ngrams:
				datap.append([ng, label])
	X = np.zeros((len(datap), ngramsize, embedsize))
	Y = np.zeros((len(datap), 3))
	wildcard = np.array([0.0]*embedsize)
	for i in range(0, len(datap)):
		ngram = datap[i][0]
		label = datap[i][1]
		vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
		labels = getLabels(label)
		X[i] = vectors
		Y[i] = labels
	return X, Y
Developer: ghpaetzold | Project: phd-backup | Lines: 28 | Source: Run_NN_MLP.py


Example 3: str_common_grams

 def str_common_grams(str1, str2, length=3):
     '''Return how many times the character n-grams (of the given length)
     of str1 appear in str2.
     '''
     grams1 = list(ngrams(str1, length))
     grams2 = list(ngrams(str2, length))
     return sum(grams2.count(gram) for gram in grams1)
Developer: KhaoticMind | Project: kaggle-homedepot | Lines: 7 | Source: homedepot.py
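
Because plain strings are passed to ngrams here, the grams are character n-grams. A small illustration:

    print(str_common_grams('night', 'nightly'))
    # 3: each character trigram of 'night' ('nig', 'igh', 'ght') occurs once in 'nightly'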


Example 4: getTestData

def getTestData(corpus, embedsize, ngramsize, m):
	datap = []
	with open(corpus) as f:
		for line in f:
			data = line.strip().split('\t')
			s1 = data[0]
			s2 = data[1]
			s1ng = set(ngrams(s1.split(' '), ngramsize))
			s2ng = set(ngrams(s2.split(' '), ngramsize))
			# Keep every n-gram that appears in either sentence.
			all_ngrams = s1ng.union(s2ng)
			datap.append(list(all_ngrams))
	Xs = []
	wildcard = np.array([0.0]*embedsize)
	for ngs in datap:
		X = np.zeros((len(ngs), ngramsize, embedsize))
		for i in range(0, len(ngs)):
			ngram = ngs[i]
			vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
			X[i] = vectors
		Xs.append(X)
	return Xs
Developer: ghpaetzold | Project: phd-backup | Lines: 25 | Source: Run_NN_MLP.py


Example 5: extract_terms_features

    def extract_terms_features(terms, separateGrams=False):
        # Drop empty tokens.
        while '' in terms:
            terms.remove('')

        # Word bigrams and trigrams, joined back into space-separated strings.
        g2j = [' '.join(gterms) for gterms in ngrams(terms, 2)]
        g3j = [' '.join(gterms) for gterms in ngrams(terms, 3)]

        vec1 = {}
        vec2 = {}
        vec3 = {}

        # Count unigrams, bigrams and trigrams in separate vectors.
        for t in terms:
            vec1[t] = vec1.get(t, 0) + 1
        for t in g2j:
            vec2[t] = vec2.get(t, 0) + 1
        for t in g3j:
            vec3[t] = vec3.get(t, 0) + 1

        # Merge the three count vectors into a single feature dict.
        vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))
        if separateGrams:
            return (vector, vec1, vec2, vec3)
        else:
            return vector
Developer: klyc0k | Project: EDSFilter | Lines: 60 | Source: twitter_methods.py


Example 6: format_text

def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries)
	decoded = str(decoded)
	decoded = decoded[2:]
	decoded = decoded[:-1]
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>[email protected][\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
Developer: mit-teaching-systems-lab | Project: threeflows | Lines: 60 | Source: calculate_emotion.py


Example 7: getNgramProbs

def getNgramProbs(file):
	with open(file, 'r') as f:
		unigramList = f.read().split()

	bigramList = ngrams(unigramList, 2)
	trigramList = ngrams(unigramList, 3)

	# Dictionaries of unigram, bigram, trigram counts.
	unigramDict = dict()
	bigramDict = dict()
	trigramDict = dict()

	# Counts for unigrams.
	countUni = 0
	for item in unigramList:
		countUni += 1
		if item not in unigramDict:
			unigramDict[item] = 1
		else:
			unigramDict[item] += 1

	# Counts for bigrams.
	for item in bigramList:
		if item not in bigramDict:
			bigramDict[item] = 1
		else:
			bigramDict[item] += 1

	# Counts for trigrams.
	for item in trigramList:
		if item not in trigramDict:
			trigramDict[item] = 1
		else:
			trigramDict[item] += 1

	# Probability of each trigram given its bigram prefix.
	for key in trigramDict:
		trigramDict[key] /= float(bigramDict[(key[0], key[1])])

	# Probability of each bigram given its first word.
	for key in bigramDict:
		bigramDict[key] /= float(unigramDict[key[0]])

	# Relative frequency of each unigram.
	for key in unigramDict:
		unigramDict[key] /= float(countUni)

	return [unigramDict, bigramDict, trigramDict]
Developer: soumyasanyal | Project: NLPTermProject | Lines: 60 | Source: kldiv.py


Example 8: scoreScopeOverlap

	def scoreScopeOverlap(self, scopeHyp, scopeRef):

		totalScore = 0

		for scope_h in scopeHyp:
			bestScore = 0
			for scope_r in scopeRef:

				if scope_r == [] or scope_h == []:
					partialScore = 0
					if partialScore > bestScore: bestScore = partialScore
				else:
					ngram_range = range(1, len(scope_h) + 1)
					logging.info("ngram_range")
					logging.info(ngram_range)
					# Weight each n-gram order proportionally to n; the weights sum to 1.
					total_weight = float(sum(ngram_range))
					score_weights = [round(x / total_weight, 4) for x in ngram_range]
					logging.info(score_weights)

					partialScore = 0.0
					for i in ngram_range:
						hyp = ngrams(scope_h, i)
						ref = ngrams(scope_r, i)
						partialScore += len(set(hyp).intersection(set(ref))) * score_weights[i - 1]
					logging.info("partialScore")
					logging.info(partialScore)
					if partialScore > bestScore: bestScore = partialScore

			totalScore += bestScore
			logging.info("totalScore")
			logging.info(totalScore)

		return totalScore
Developer: wilkeraziz | Project: chisel-features | Lines: 32 | Source: main.py


Example 9: create_candidate_list

def create_candidate_list(sentence):
    tokens = nltk.tokenize.word_tokenize(sentence)

    candidates_lists = create_candidates_lists(tokens)

    # Create list of 1-grams.
    candidates = []
    for l in candidates_lists:
        candidates += l

    # Remove irrelevant stop words in 1-grams.
    res = [token for token in candidates
        if token not in ENGLISH_STOPWORDS]

    # Create list of bigrams.
    bigrams = []
    for l in candidates_lists:
        bigrams += ngrams(l, 2)

    # Create list of trigrams.
    trigrams = []
    for l in candidates_lists:
        trigrams += ngrams(l, 3)

    # Create list of 4-grams.
    fourgrams = []
    for l in candidates_lists:
        fourgrams += ngrams(l, 4)

    res += [' '.join(a) for a in bigrams]
    res += [' '.join(a) for a in trigrams]
    res += [' '.join(a) for a in fourgrams]

    return res
Developer: srom | Project: ensu | Lines: 34 | Source: select_aliases.py


Example 10: calc_ngram

def calc_ngram(htokens,etokens):
    features = []
    for n in range(1,5):
        hgrams = nltk.FreqDist(ngrams(htokens,n))
        egrams = nltk.FreqDist(ngrams(etokens,n))
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        features.append(calc_f1(prec,recall))
    return features
Developer: da03 | Project: sp2016.11-731 | Lines: 25 | Source: generate_feature.py
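
calc_f1 is not shown in this snippet; a minimal hypothetical sketch, assuming it is the standard F1 harmonic mean, together with an example call:

    import nltk
    from nltk.util import ngrams

    def calc_f1(prec, recall):
        # Assumed helper: standard F1 score, 0 when both inputs are 0.
        return 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0

    h = 'the cat sat on the mat'.split()
    e = 'the cat is on the mat'.split()
    print(calc_ngram(h, e))  # 12 features: precision, recall, F1 for n = 1..4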


Example 11: rouge_s

    def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):

        rouge_s_list = []
        k_c = len(candidate) if d_skip is None else d_skip
        cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate),
                              n=2, k=k_c))
        for ref in references:
            k_ref = len(ref) if d_skip is None else d_skip
            ref_skip_list = list(skipgrams(tokenizer.tokenize(ref),
                                 n=2, k=k_ref))
            count = 0
            for bigram in cand_skip_list:
                if bigram in ref_skip_list:
                    count = count+1
            if not smoothing:
                r_skip = count/len(ref_skip_list)
                p_skip = count/len(cand_skip_list)
            else:
                cand_ungm = list(ngrams(tokenizer.tokenize(candidate),
                                      n=1))
                ref_ungm = list(ngrams(tokenizer.tokenize(ref),
                                     n=1))
                for ungm in cand_ungm:
                    if ungm in ref_ungm:
                        count += 1
                r_skip = count/(len(ref_skip_list)+len(ref_ungm))
                p_skip = count/(len(cand_skip_list)+len(cand_ungm))
            score = Rouge.get_score(r_skip, p_skip, beta)           
            rouge_s_list.append(score)
        return Rouge.jacknifing(rouge_s_list, averaging=averaging)
Developer: 53X | Project: NLP-Metrics | Lines: 30 | Source: rouge.py
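
The skip-bigrams here come from nltk.util.skipgrams, a sibling of ngrams that yields n-grams which may skip up to k intervening tokens. A quick look at what it produces:

    from nltk.util import skipgrams

    print(list(skipgrams(['the', 'cat', 'sat'], n=2, k=1)))
    # [('the', 'cat'), ('the', 'sat'), ('cat', 'sat')]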


Example 12: char_ngram_similarity

def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score for two documents with respect to their top character n-gram
    distributions. If the value is 0, the documents are identical (or at least share an identical top
    character n-gram distribution).
    :param doc1: first document (a string)
    :param doc2: second document (a string)
    :param n: the n-gram length
    :param top: only use the `top` most frequent n-grams from each document.
    :return: A positive dissimilarity score; 0 means identical top character n-gram distributions.
    """

    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))

    profile1 = [g[0] for g in ngrams1.most_common(top)]
    profile2 = [g[0] for g in ngrams2.most_common(top)]

    # Normalise the two n-gram distributions.
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1

    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2

    # Calculate the global dissimilarity score over both profiles.
    score = 0
    for g in set(profile1 + profile2):
        f1 = ngrams1[g]
        f2 = ngrams2[g]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score
Developer: rug-compling | Project: glad | Lines: 34 | Source: glad-main.py
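
A hypothetical call on two short strings (character trigrams, since the documents are passed to ngrams as plain strings):

    d = char_ngram_similarity('the cat sat on the mat', 'the cat sat on a mat', n=3)
    print(d)  # 0 means identical top n-gram profiles; larger values mean more dissimilar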


Example 13: jaccardIdx

def jaccardIdx(w1, w2):
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))

    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)

    return 1.0 - float(len(intersect)) / float(len(union))
Developer: weezel | Project: ITIS13 | Lines: 8 | Source: russiannames.py
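
Because the inputs are plain strings, ngrams(w, 2) produces character bigrams, and the function returns the Jaccard distance between the two bigram sets:

    # Bigrams of 'color': {co, ol, lo, or}; of 'colour': {co, ol, lo, ou, ur}
    # Intersection has 3 elements, union has 6.
    print(jaccardIdx('color', 'colour'))  # 1.0 - 3/6 = 0.5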


Example 14: count_word

    def count_word(self, doc, unigram=True, bigram=False, binary=False):
        tokens = word_tokenize(self.remove_non_ascii(doc))
        doc_voc = {}
        if unigram:
            uni = ngrams(tokens, 1)
            self.count_word_sub(doc_voc, uni, binary)

        if bigram:
            bi = ngrams(tokens, 2)
            self.count_word_sub(doc_voc, bi, binary)
        # Return the accumulated vocabulary counts.
        return doc_voc
Developer: akshaynavada | Project: NLP | Lines: 10 | Source: NB.py


Example 15: trainModel

	def trainModel(self, listOfFilenames):
		#dictionary of unigrams, bigrams, trigrams
		unigramDict = dict()
		bigramDict = dict()
		trigramDict = dict()

		#total count of unigrams, bigrams, trigrams
		countUni = 0
		countBi = 0
		countTri = 0

		i = 1
		#iterate over list of files
		for fileName in listOfFilenames:
			print "Reading", i
			i += 1
			stag = STagger(fileName)
			stag.find_unigrams(True, False)
			for item in stag.unigrams:
				countUni += 1
				if item not in unigramDict:
					unigramDict[item] = 1
				else:
					unigramDict[item] += 1
			codeBigrams = ngrams(stag.unigrams, 2)
			codeTrigrams = ngrams(stag.unigrams, 3)
			for item in codeBigrams:
				countBi += 1
				if item not in bigramDict:
					bigramDict[item] = 1
				else:
					bigramDict[item] += 1
			for item in codeTrigrams:
				countTri += 1
				if item not in trigramDict:
					trigramDict[item] = 1
				else:
					trigramDict[item] += 1

		
		#write the ngrams to the file
		outputFile = open('corpus.txt', 'w')
		outputFile.write(str(countUni) + "\n")
		for key, x in unigramDict.items():
			outputFile.write(str(key) + " " + str(x) + "\n")

		outputFile.write(str(countBi) + "\n")
		for key, x in bigramDict.items():
			outputFile.write(str(key[0]) + " "  + str(key[1]) + " " + str(x) + "\n")

		outputFile.write(str(countTri) + "\n")
		for key, x in trigramDict.items():
			outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")

		outputFile.close()
Developer: soumyasanyal | Project: NLPTermProject | Lines: 55 | Source: extract2.py


Example 16: ngram_similarity

def ngram_similarity(str1, str2, n = 3):
    str1 = str1.split()
    str2 = str2.split()
    ngram1 = []
    ngram2 = []
    for i in range(n):
        ngram1 = ngram1 + list(ngrams(str1,n-i))
    
    for i in range(n):
        ngram2 = ngram2 + list(ngrams(str2,n-i))
    return jaccard_dis(set(ngram1),set(ngram2))
Developer: kunrenzhilu | Project: dmproject_updated | Lines: 11 | Source: random_forestscript_v2.py
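
jaccard_dis is not shown in the snippet; a minimal hypothetical sketch, assuming it computes the Jaccard distance between two sets, followed by an example call:

    def jaccard_dis(set1, set2):
        # Assumed helper: Jaccard distance = 1 - |intersection| / |union|.
        union = set1 | set2
        if not union:
            return 0.0
        return 1.0 - len(set1 & set2) / float(len(union))

    print(ngram_similarity('the cat sat', 'the cat slept'))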


Example 17: get_ngrams

 def get_ngrams(self, tokens):
     tokens.insert(0, '<START>')
     unigrams = ngrams(tokens,1)
     # key for unigrams is ('word',), not just 'word' string.
     for item in unigrams: self.fdist1[item] += 1 
     
     bigrams = ngrams(tokens,2)
     for item in bigrams: self.fdist2[item] += 1 
     
     trigrams = ngrams(tokens,3)
     for item in trigrams: self.fdist3[item] += 1 
Developer: ylmeng | Project: generate_captions | Lines: 11 | Source: getNgrams.py


Example 18: modified_precision

    def modified_precision(candidate, references, n):
        """ Calculate modified ngram precision.

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=1,
        ... )
        0.28...

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=2,
        ... )
        0.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=1,
        ... )
        1.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=2,
        ... )
        1.0

        """
        counts = Counter(ngrams(candidate, n))

        if not counts:
            return 0

        max_counts = {}
        for reference in references:
            reference_counts = Counter(ngrams(reference, n))
            for ngram in counts:
                max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

        clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())

        return sum(clipped_counts.values()) / sum(counts.values())
Developer: 52nlp | Project: Text-Summarization | Lines: 54 | Source: bleu.py


Example 19: uni_bi_gram

 def uni_bi_gram(self,doc,unigram,bigram):
     ret_list = []
     if(unigram):
         uni = ngrams(doc,1)
         for gram in uni:
             ret_list.append(gram)
     if(bigram):
         bi = ngrams(doc,2)
         for gram in bi:
             ret_list.append(gram)
     return ret_list
Developer: akshaynavada | Project: NLP | Lines: 11 | Source: NB.py


Example 20: create_model

def create_model(tokenized_data):
    tokens_list = [token for ndata in tokenized_data for token in ndata]
    # Conditional frequency distribution over bigrams for next-word prediction.
    cfreq_data_bigram = nltk.ConditionalFreqDist(nltk.bigrams(tokens_list))
    # Higher-order n-gram generators over the same token stream.
    trigrams = ngrams(tokens_list, 3)
    fourgram = ngrams(tokens_list, 4)
    fivegram = ngrams(tokens_list, 5)
    sixgram = ngrams(tokens_list, 6)
    return cfreq_data_bigram, trigrams, fourgram, fivegram, sixgram
Developer: nusebac | Project: Akosha | Lines: 12 | Source: Main.py



Note: The nltk.util.ngrams examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and distribution and use should follow each project's License. Please do not repost without permission.

