Python nltk.pos_tag Function Code Examples


This article collects and summarizes typical usage examples of the nltk.pos_tag function in Python. If you are wondering what exactly pos_tag does, how to call it, and what real-world uses look like, the hand-picked code examples below should help.



The following presents 20 code examples of the pos_tag function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
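Before the examples, here is a minimal, self-contained sketch of what pos_tag does. It assumes the standard NLTK resources (punkt for tokenization, averaged_perceptron_tagger for tagging) have already been downloaded:

import nltk

# nltk.download('punkt')                       # one-time setup, assumed already done
# nltk.download('averaged_perceptron_tagger')  # one-time setup, assumed already done

tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(nltk.pos_tag(tokens))
# -> a list of (token, Penn Treebank tag) pairs, e.g. [('The', 'DT'), ('quick', 'JJ'), ...]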

Example 1: create_synonyms

def create_synonyms(orig_word):
    '''
    function for creating synonyms for the passed word
    '''
    try:
        headers = {
            "X-Mashape-Key": "aIder4iWr4msh5Scn073WRoddmAEp1qA0I3jsnSR8lfJwtyzpg",
            "Accept": "application/json"}

        response = requests.get("https://wordsapiv1.p.mashape.com/words/{}/synonyms".format(orig_word), headers=headers)
        if response.status_code == 200:
            json = response.json()
            synonyms = json['synonyms']
            # synonyms = nltk.word_tokenize(synonyms)
            synonyms = nltk.pos_tag(synonyms)
            word = nltk.word_tokenize(orig_word)
            word = nltk.pos_tag(word)[0]
            print(synonyms)
            good_syns = []
            for syn in synonyms:
                print(word[1], syn[1])
                if word[1] == syn[1]:
                    print('*')
                    good_syns.append(syn[0])
            word, _ = Word.objects.get_or_create(word=orig_word)  # get_or_create returns (obj, created)
            for syn in good_syns[:2]:
                try:
                    new_word = Word.objects.create(word=syn.lower(), is_synonym=True)
                except Exception:
                    new_word = Word.objects.get(word=word)
                syn = Synonym.objects.create(word=new_word)
                syn.synonym_to.add(word)
            return good_syns
    except Exception as e:
        print(e)
Author: Dambre, Project: social_advisor, Lines: 35, Source: dictionary.py


Example 2: extract_pos_pair

def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1=""
    extent1=""
    trigger2=""
    extent2=""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1=one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2=one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1=one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2=one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1=one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2=one_pair[1]
            break
    return (pos1, pos2)
Author: wtl-zju, Project: KBP2015, Lines: 26, Source: coref_feature_extraction.py


Example 3: writeOut

def writeOut(lsummary_out, allwordsphrases=[],  outputpath='.', gridset=''):    
 
    # Write data out for the last folder (gridset) encountered - MUST BE A BETTER WAY THAN THIS?
    uWordsPhrases = uniqueSet(allwordsphrases)              # Set of unique words.
    uwords = []
    uphrases = []
    words = []
    phrases = []
    wordtypes = []
    total_wordsphrases = total_uwordsphrases = total_words = total_phrases = 0

    ldata_out = UnicodeWriter(open(outputpath + '/'+ gridset +'/language-data.csv', 'wb'), delimiter=',', quotechar='"')
    ldata_out.writerow(["WORD", "NUMBER OF WORDS", "COUNT", "TYPE"])
    
    # Output metrics to file.
    for item in uWordsPhrases:
       num_words = len(item.split())
       item_count = allwordsphrases.count(item)
       if num_words == 1:                          # Single word
          word_type = nltk.pos_tag(item)[-1][-1]
          #word_type_help = nltk.help.upenn_tagset(word_type)
# MAYBE CONVERT TAGS INTO MORE USEFUL WORDS?!
          ldata_out.writerow([item, str(num_words), str(item_count), word_type])
          uwords.append(item)
          wordtypes.append(word_type)
       elif num_words > 1:                         # Phrase
          nltk_words = nltk.word_tokenize(item)
          word_pos = nltk.pos_tag(nltk_words) ### HOW TO DEAL WITH PHRASES???
          word_types = [x[1] for x in word_pos]
          ldata_out.writerow([item, str(num_words), str(item_count), " ,".join(word_types)])
# HOW TO OUTPUT EACH POS TO A COLUMN???
          uphrases.append(item)

    for item in allwordsphrases:
        num_words = len(item.split())
        if num_words == 1:
            words.append(item)
        elif num_words > 1:
            phrases.append(item)
        
    uword_types = countDuplicatesInList(wordtypes)
    
    total_wordsphrases = len(allwordsphrases)
    total_uwordsphrases = len(uWordsPhrases)
    total_uwords = len(uwords)
    total_uphrases = len(uphrases)

    total_words = len(words)
    total_phrases = len(phrases)
    
    #["File Name", "Total Words or Phrases", "Total Unique Words or Phrases", "Total Words", "Total Phrases", "Total Unique Words", "Total Unique Phrases", "Types of Word"])
    lsummary_out.writerow([gridset, str(total_wordsphrases), str(total_uwordsphrases), str(total_words), str(total_phrases), str(total_uwords), str(total_uphrases), ', '.join(map(str, uword_types))])

    raw_words_out = open(outputpath + '/'+ gridset +'/raw-unique-words.text', 'wb')
    raw_words_out.writelines('\n'.join(uWordsPhrases).encode('utf-8'))
    raw_phrases_out = open(outputpath + '/'+ gridset +'/raw-unique-phrases.txt', 'wb')
    raw_phrases_out.writelines('\n'.join(uphrases).encode('utf-8'))
    raw_words_out = open(outputpath + '/'+ gridset +'/raw-wordsphrases.text', 'wb')
    raw_words_out.writelines('\n'.join(allwordsphrases).encode('utf-8'))
Author: simonjudge, Project: AAC-Tools, Lines: 60, Source: wordlistMetrics.py
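A note on Example 3: nltk.pos_tag expects a list of tokens. When it receives a bare string, as in word_type = nltk.pos_tag(item)[-1][-1], it treats the string as a sequence of characters and tags each character, so the expression ends up returning the tag of the last character rather than of the word. A safer sketch for tagging a single word (an assumption about the intent, not the project's actual code) would be:

import nltk

word_type = nltk.pos_tag([item])[0][1]  # wrap the word in a list, then read its tag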


Example 4: nltk_filter

def nltk_filter(sent):
  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1            = b1.lower()
  tokens        = word_tokenize(b1)
  pos_tags      = pos_tag(tokens)
  filtered_sent = ' '
  for token in tokens:
    filtered_sent += '1'+token + ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

#note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2            = b2.lower()
  tokens        = word_tokenize(b2)
  pos_tags      = pos_tag(tokens)
  # filtered_sent = ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  for token in tokens:
    filtered_sent += '2' + token + ' '

  return filtered_sent
Author: gthandavam, Project: Recipes, Lines: 30, Source: builder.py
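Example 4 references blockSeparator, filterList, and stemmer, which are defined elsewhere in the project. A hedged sketch of the module-level setup it appears to assume (the actual values are not shown in the excerpt) might be:

from nltk import word_tokenize, pos_tag
from nltk.stem.porter import PorterStemmer

blockSeparator = '\t'                    # assumed delimiter between the two text blocks
filterList = ['NN', 'NNS', 'VB', 'VBD']  # assumed POS whitelist for the commented-out filter
stemmer = PorterStemmer()                # assumed stemmer behind stemmer.stem(...)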


Example 5: load_data

def load_data(path):
    sentences_pos = []
    r1 = re.compile(r'\<([^ ]+)\>')
    r2 = re.compile(r'\$US(\d)')
    for l in open(path):
        if not l.strip():
            continue
        l = l.decode('utf-8')
        l = l.replace(u'’', "'")
        l = l.replace(u'``', '"')
        l = l.replace(u"''", '"')
        l = l.replace(u"—", '--')
        l = l.replace(u"–", '--')
        l = l.replace(u"´", "'")
        l = l.replace(u"-", " ")
        l = l.replace(u"/", " ")
        l = r1.sub(r'\1', l)
        l = r2.sub(r'$\1', l)
        s = l.strip().split('\t')
        sa, sb = tuple(nltk.word_tokenize(s)
                          for s in l.strip().split('\t') if s) # ignore double \t
        sa, sb = ([x.encode('utf-8') for x in sa],
                  [x.encode('utf-8') for x in sb])

        for s in (sa, sb):
            for i in xrange(len(s)):
                if s[i] == "n't":
                    s[i] = "not"
                elif s[i] == "'m":
                    s[i] = "am"
        sa, sb = fix_compounds(sa, sb), fix_compounds(sb, sa)
        sentences_pos.append((nltk.pos_tag(sa), nltk.pos_tag(sb)))
    return sentences_pos
Author: STS-NTNU, Project: STS13, Lines: 33, Source: simpfeats.py


Example 6: replace_proper_nouns

    def replace_proper_nouns(self, o_sent, n_sent):
        proper_nouns = []
        p_pnouns = []

        o_tagged = pos_tag(word_tokenize(o_sent))
        n_tagged = pos_tag(word_tokenize(n_sent))
        # print("\nTransforming the output:")
        # print("Input sentence:", o_sent)
        # print("Found sentence:", n_sent)
        # print("Input sentence tagged:", o_tagged)
        # print("Found sentence tagged:", n_tagged)

        for o in o_tagged:
            if o[1] == 'NNP' and o not in proper_nouns:
                proper_nouns.append(o)

        for n in n_tagged:
            if (n[1] == 'PRP' or n[1] == 'PRP$' or n[1] == 'NNP') and n not in p_pnouns:
                p_pnouns.append(n)

        # print("")

        if (len(proper_nouns) == 1) and (len(p_pnouns) > 0):
            n_sent = sub(r"\b%s\b" %p_pnouns[0][0] , proper_nouns[0][0], n_sent, 1)
            gender = self.gp.classify(proper_nouns[0][0])
            # print(proper_nouns[0][0], "is classified as", gender)
            for pnoun in p_pnouns:
                n_pnoun = self.change_gender(pnoun[0], gender)
                n_sent = sub(r"\b%s\b" %pnoun[0] , n_pnoun, n_sent)
        elif len(proper_nouns) < 1:
            print("No proper nouns to replace")
        else:
            print("Not yet implemented, :P")

        return n_sent
Author: theopak, Project: storytellingbot, Lines: 35, Source: Extrapolate.py


Example 7: normalize_word

def normalize_word(word, lowercase=True, lemmatize=True):
    "Normalize word by stripping plural nouns"
    global NORMWORD_CACHE
    global NORMWORD_POS
    if NORMWORD_WNL is None:
        init_normword_wnl()
    if lowercase:
        word = word.lower()
    if word in NORMWORD_CACHE:
        return NORMWORD_CACHE[word]
    if not lemmatize:
        return word
    treebank_tag = nltk.pos_tag([word])[0][1]
    newword = word
    if ( len(newword) > 4 ) and ( treebank_tag == 'NNS' ):
        #  Only lemmatize plural nouns, leave verbs alone
        wnpos = get_wordnet_pos(treebank_tag)
        if wnpos:
            newword = NORMWORD_WNL.lemmatize(newword, wnpos)
        if newword != word:
            LOGGER.debug('Changing %s to %s' % (word, newword))
        NORMWORD_POS[newword] = nltk.pos_tag([newword])[0][1]
    else:
        NORMWORD_POS[word] = treebank_tag
    NORMWORD_CACHE[word] = newword
    return newword
Author: markgraves, Project: sanal, Lines: 26, Source: sautil.py
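Example 7 calls a get_wordnet_pos helper that is not included in the excerpt. A typical implementation of such a Treebank-to-WordNet mapping (a sketch of the usual pattern, not necessarily the project's code) looks like:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None if unmapped."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None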


Example 8: test_nltkNERParsing

    def test_nltkNERParsing(self):
        testString = 'Natural Sciences and Engineering Research Council of Canada'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        getGPEs = []

        for treeBranch in chunked:
            if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
                getGPEs.append(str(treeBranch))

        self.assertEqual(1, len(getGPEs))

        testString = 'Milwaukee Foundation'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (GPE New/NNP)(ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
Author: kyajmiller, Project: Cerebro, Lines: 29, Source: TestClassifyBadScholarships.py


Example 9: printer

def printer(sentencescorelist, sentenceList, wordscorelist, wordList):
    outFile = open('./tldr/outFile.txt', 'w')
    for s in range(0, len(sentenceList)):
        if s in sentencescorelist:
            printsentence(sentenceList[s], outFile)
    outFile.write("Topics to research: ")

    topics = []
    numtopics = 3
    poswords = nltk.pos_tag(wordList)
    poskeep = ["NN", "NNS", "NNP", "NNPS"]

    while numtopics > 0:
        temp = max(wordscorelist.iteritems(), key=operator.itemgetter(1))[0]
        templist = [temp]
        templist = nltk.pos_tag(templist)
        if templist[0][1] in poskeep:
            numtopics -= 1
            topics.append(temp)
        del wordscorelist[temp]
    for i in range(0, len(topics)):
        if i != len(topics) - 1:
            outFile.write(topics[i] + ", ")
        else:
            outFile.write(topics[i])
    outFile.close()
Author: fernandest, Project: TLDR_Twist, Lines: 26, Source: main.py


Example 10: parse_stock_name

    def parse_stock_name(self, stockname):
        p = engine()

        instruction_set = stockname.split(',')
        word_list = instruction_set[0].split(' ')
        index = 1
        categories_ignored = ['RB', 'TO']
        tokens = word_tokenize(instruction_set[0])
        tags = pos_tag(tokens)
        i=0
        while i < len(tags):
            if tags[i][1] in categories_ignored:
                index += 1
                i+= 1
            else:
                break

        quantity = word_list[index-1]
        disallowed = ['g', 'ml', 'x', 'kg', 'cups', 'cup', 'grams', 'can', 'tbsp', 'tsp', 'tbsps', 'tsps',
                 'small', 'bunch', 'piece', 'handful', 'pack', 'chopped', 'large', 'a', 'pinch',
                 'fresh', 'dried', 'heaped', 'thick', 'slices', 'slice', 'of', 'about']
        while index < len(word_list):
            if word_list[index] not in disallowed:
                break
            else:
                index+=1
        sentence = " ".join(word_list[index:])
        tokens = word_tokenize(sentence)
        categories = pos_tag(tokens)
        words = []
        for category in categories:
            if category[1] not in ['NNS', 'VBN', 'VBG']:
                words.append(category[0])
        word = " ".join(words)
        return quantity, word, None
Author: Godley, Project: MealPlanner, Lines: 35, Source: pipelines.py


Example 11: test

def test(ws,wf,s,pf,wm,alfa2):
    f1=open('test_data.data','rb')
    f2=open('test.csv','rb')
    val_text=f1.read()
    comt=f2.read().splitlines()
    val_lines=val_text.splitlines()
    acc=0
    lc=0
    for line in val_lines:
        token = line.split(' | ')
        token[2]="<S> "+token[2]+" <E>"
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S> ":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!=" <E>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<E>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff,alfa2)
        if val_label==comt[lc].split(",")[1]:
            acc+=1
        lc+=1
    print float(acc)/len(val_lines)
    f1.close()
    f2.close()
Author: saumyakb, Project: CS4740-NLP-Superwised-WSD, Lines: 27, Source: trainer2.py


Example 12: m_surrounding

 def m_surrounding(self):
    D = {}
    sent = self.sentence["form"]
    l = len(sent)
    #print sent 
    K = self.index
    '''
    for k in range(l):
        if sent[k] == self.word:
            K = k
            break
    '''
    #print K, l
    tagp = tagn = ""
    if (K+1) < l:
        tagn = nt.word_tokenize(sent[K+1])
        tagn = nt.pos_tag(tagn)     
    if (K-1) >=0:
        tagp = nt.word_tokenize(sent[K-1])
        tagp = nt.pos_tag(tagp)        
        
    if tagp != "":
        D["ptag"] = tagp[0][1]
    else: 
        D["ptag"] = ""
    if tagn != "":    
        D["ntag"] = tagn[0][1]
    else:
        D["ntag"] = ""
        
    print D
    return D 
Author: korlev91, Project: CWI---complex-word, Lines: 32, Source: WordFeatures.py
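Example 12 uses nt as a module alias and relies on self.sentence, self.word, and self.index from the surrounding class, none of which are shown. The alias is presumably created with:

import nltk as nt  # assumed import behind the "nt" alias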


Example 13: score_glove_pos

def score_glove_pos(src, dst, numpy_arrays, labels_array, g, normalize=True):
	b1 = []
	b2 = []
	lines = 0
	with open(src) as p:
		for i, line in enumerate(p):
			s = line.split('\t')
			b1.append(s[0])
			b2.append(s[1][:-1]) #remove \n
			lines = i + 1

	b1_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b1]
	b2_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b2]

	res = []
	for i in range(lines):
		tags1 = [tag[0] for tag in b1_pos[i] if tag[1] in NOUN]
		tags2 = [tag[0] for tag in b2_pos[i] if tag[1] in NOUN]
		r = [1 - spatial.distance.cosine(g[tag1], g[tag2]) for tag1 in tags1 for tag2 in tags2 if tag1 in labels_array and tag2 in labels_array]
		if len(r) == 0:
			res.append(0)
		else:
			res.append(round(5*max(r), 2))

	if normalize:
		res = normarlize_score(res)
			
	with open(dst, 'w') as thefile:
		thefile.write("\n".join(str(i) for i in res))
	print src + ' finished!'
Author: wintor12, Project: SemEval2015, Lines: 30, Source: run.py
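Example 13 depends on several names defined elsewhere in the project: NOUN (a collection of noun tags), g (a word-to-vector lookup, e.g. loaded GloVe embeddings), labels_array (the vocabulary of that lookup), and normarlize_score (a project helper, not shown). A hedged sketch of the assumed imports and the NOUN constant:

import re
import nltk
from scipy import spatial

NOUN = {'NN', 'NNS', 'NNP', 'NNPS'}  # assumed set of Penn Treebank noun tags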


Example 14: test

def test(ws,wf,s,pf):
    f1=open('validation_data.data','rb')
    #f2=open('test_data.csv','w')
    val_text=f1.read()
    val_lines=val_text.splitlines()
    acc=0

    for line in val_lines:
        token = line.split(' | ')
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S>":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!="<\S>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<\S>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff)
        #f2.write(token[0]+" | "+val_label+" | "+token[2])
    #f1.close()
    #f2.close()
    #print "Done"
    
        

        if val_label==token[1]:
            acc+=1
    print float(acc)/len(val_lines)
Author: saumyakb, Project: CS4740-NLP-Superwised-WSD, Lines: 29, Source: MakeDictKaggle.py


Example 15: expand_with_wordnet

def expand_with_wordnet(query):
    """
    This function expands every contentful word in the query with its wordnet
    definition. The word itself is not removed. Stop words are removed from the
    word definition as well.
    (Contentful means that it is not a stopword or punctuation sign)

    INPUT:
        query   --  user query that is a simple string
    OUTPUT:
        expanded_query  --  user query + definitions of contentful words
    """
    stop = stopwords.words("english")
    stop += EXCLUDED
    contentful_tokens = [tok for tok in query.split() if tok not in stop]
    # take the first definition for the current word
    defs = []
    for token in contentful_tokens:
        syn1 = wn.synsets(token, pos=wn.ADJ)[:1]
        syn2 = wn.synsets(token, pos=wn.NOUN)[:1]
        # we take into account only adj defs
        if syn1:
            defs.append(token)
            def_tokenized = word_tokenize(syn1[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
        elif syn2:
            defs.append(token)
            def_tokenized = word_tokenize(syn2[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
    # expansion can add some EXCLUDED words back in the query
    defs = set(defs) - set(EXCLUDED)  # removing again
    expanded = " ".join(defs)
    return expanded
Author: tastyminerals, Project: cocktail_bot, Lines: 33, Source: cocktail_ir.py


Example 16: extract_entities2

def extract_entities2(text):
	entities = []
	
	"""t0 = nltk.DefaultTagger('NN')
	t1 = nltk.UnigramTagger(train_sents, backoff=t0)
	t2 = nltk.BigramTagger(train_sents, backoff=t1)
	t2.evaluate(test_sents)"""
	
	for sentence in sent_tokenize(text):
	    #print pos_tag(nltk.word_tokenize(sentence))
	    print sentence
	    tags=pos_tag(nltk.word_tokenize(sentence))
	    tags=tagear(tags)
	    chunks = ne_chunk(pos_tag(nltk.word_tokenize(sentence)))
	    #chunks = ne_chunk(regexp_tagger.tag((nltk.word_tokenize(text))))
	    chunks = ne_chunk(tags)
	    #chunks.draw()
	    #print chunks
	    for chunk in chunks:
	    	#print chunk
	    	#if hasattr(chunk, 'node'):
	    	#	print chunk.node
	    	if hasattr(chunk, 'node') :
	    		print chunk	
	    		entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
	return entities
Author: jholoc, Project: proyectoScrapy, Lines: 26, Source: Tokenizacion.py
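Example 16 filters chunks with hasattr(chunk, 'node'), which only works with NLTK 2.x trees; NLTK 3 replaced the node attribute with the label() method (as used in Example 8), so the check no longer behaves as intended on current NLTK. A minimal version-agnostic sketch for collecting named-entity subtrees from ne_chunk output could be:

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize

def extract_named_entities(text):
    entities = []
    for sentence in sent_tokenize(text):
        chunks = ne_chunk(pos_tag(word_tokenize(sentence)))
        for chunk in chunks:
            if isinstance(chunk, nltk.Tree):  # named-entity chunks are Tree subtrees
                entities.append(chunk)
    return entities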


Example 17: tokenizeme

 def tokenizeme(self, LanguageSample):
     self.tokenized_text=nltk.word_tokenize(LanguageSample)
     self.unique_words=list(set(self.tokenized_text))
     self.unique_words.sort()
     self.unique_words=nltk.pos_tag(self.unique_words) # unique words do not remove inflectional-morpheme duplicates
     self.tagged_text = [i for i in nltk.pos_tag(self.tokenized_text) if i[1]!="."] #pos_tag gets the part of speech, loop removes punctuation
     self.count = len(self.tagged_text)
Author: theredwillow, Project: SLP_Assessment, Lines: 7, Source: NLTK_Info.py


Example 18: make_pos

        def make_pos(target_tag, edit_rev):
            tags, srcs, dsts = edit_rev

            sentence = ''

            if target_tag == del_tag:
                sentence = dsts
            elif target_tag == add_tag:
                sentence = srcs

            if target_tag in tags:
                tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
                trimed = sentence
                for tag_index in tag_indexes:
                    trimed = trimed[:tag_index] + trimed[tag_index+1:]

                posed = pos_tag(trimed)
                pos = [w[1] for w in posed]
                for tag_index in tag_indexes:
                    pos.insert(tag_index, u'')

                # debug
                none_indexes = [i for i, x in enumerate(pos) if x == u'']
                if tag_indexes != none_indexes:
                    print(tag_indexes, file=sys.stderr)
                    print(none_indexes, file=sys.stderr)
                    print(tags, file=sys.stderr)
                    print(pos, file=sys.stderr)
            else:
                posed = pos_tag(u' '.join(sentence).split())
                pos = [w[1] for w in posed]

            return pos
Author: tkyf, Project: epair, Lines: 33, Source: englishword_edit_distance.py


Example 19: glv_window_overlap

def glv_window_overlap(t1, t2, n = 5):
    ''' Looks for an alignment within the window between sentences
        (non-overlapping within the sentence) and words
        with compatible lemmas POS.  Emits features regarding the distance between common words, and
        finds the glv vector difference between pos-tag aligned words,
        inversely weighted by sentence distance. '''
        
    ''' Looks within a window of influence around word matches for context, and compares the glove 
        vectors within the (n - 1) gram context.  Produces dim * (n - 1) dense features.'''

    features = Counter()
    v_tagged = pos_tag(leaves(t1))
    w_tagged = pos_tag(leaves(t2))

    for v in ntuples(v_tagged, n):
        for w in ntuples(w_tagged, n):
            # Find alignment
            alignments = find_exact_alignments(v, w)
            for i, j in alignments:
                ''' Featurize the word alignment in the window '''  
                features[v[i][0] + str(i - j) ] += 1
            if not alignments:
                continue
            else:
                similar_align = find_tagged_alignments(v, w, alignments)
                for i, j in similar_align:
                    word_diff = np.exp ( glvvec( v[i][0]) - glvvec( w[j][0]) ) 
                    
                    for dim in range(word_diff.shape[0]): 
                        features[ v[i][1] + ' aligned dim ' +  str(dim)] += word_diff[dim]

    return features
Author: BinbinBian, Project: 224UProject, Lines: 32, Source: features.py


Example 20: text_to_pos_list

def text_to_pos_list(lst):
    dpos_list = []
    tpos_list = []
    for line in lst:
        if "IsTruthFul" in line:
            continue
        else:
            if line[0] == "0": #If deceptive:
                dpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    dpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        dpos_list.append(t)
                    dpos_list.append("</s>")
                dpos_list.append("</r>")
            else:
                tpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    tpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        tpos_list.append(t)
                    tpos_list.append("</s>")
                tpos_list.append("</r>")
    return (dpos_list, tpos_list)
Author: cheelan, Project: NLP, Lines: 28, Source: deception.py



Note: The nltk.pos_tag function examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Please refer to each project's license before distributing or using the code; do not repost without permission.

