
Python nltk.word_tokenize Function Code Examples


This article compiles typical usage examples of the nltk.word_tokenize function in Python. If you have been wondering what word_tokenize does, how to call it, or what it looks like in real code, the hand-picked examples below should help.



The following shows 20 code examples of the word_tokenize function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
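
Before diving into the collected examples, here is a minimal, self-contained sketch of the basic call. The sample sentence and the printed output are illustrative only, and word_tokenize needs NLTK's Punkt tokenizer models, which are downloaded once via nltk.download:

import nltk

# nltk.download('punkt')  # uncomment on first use to fetch the Punkt tokenizer models

text = "NLTK's word_tokenize splits raw text into word and punctuation tokens."
tokens = nltk.word_tokenize(text)
print(tokens)
# Roughly: ['NLTK', "'s", 'word_tokenize', 'splits', 'raw', 'text',
#           'into', 'word', 'and', 'punctuation', 'tokens', '.']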

Example 1: load_file_without_frequency

    def load_file_without_frequency(self, positif, negatif):
        tab = []
        maxs = self.nbFeatures
        phrases = []
        y = []
        with codecs.open(positif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the trailing \n
                phrases.append(line)
                y.append(1)
                for mot in word_tokenize(line):
                    tab.append(mot)
        with codecs.open(negatif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the trailing \n
                phrases.append(line)
                y.append(0)
                for mot in word_tokenize(line):
                    tab.append(mot)
        word_fd = FreqDist(tab)
        print(word_fd)
        for i in range(len(phrases)):
            mots = word_tokenize(phrases[i])
            tmp = []
            for element in mots:
                tmp.append(word_fd[element])
            if len(tmp) < maxs:
                for j in range(maxs - len(tmp)):
                    tmp.append(0)
            elif len(tmp) > maxs:
                tmp = tmp[:maxs]
            phrases[i] = tmp
        return (np.array(phrases), np.array(list(set(tab))), np.array(y))
Developer: Nicolas99-9 | Project: TERApprentissage | Lines: 33 | Source: neural.py


Example 2: __init__

 def __init__(self, title, full_text, sentence):
     self.title = title
     self.sentence = sentence
     # map of word -> number of times it appears in the full article text
     self.full_text_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(full_text))
     # map of word -> number of times it appears in the given sentence
     self.sentence_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sentence))
Developer: jeevnayak | Project: gapfill | Lines: 7 | Source: keyword_chooser.py
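
Examples 1 and 2 both feed the output of word_tokenize into nltk.FreqDist to count token frequencies. Here is a minimal sketch of that pattern; the sentence is made up and the printed counts are only what one would expect:

import nltk

tokens = nltk.word_tokenize("The cat sat on the mat. The mat was flat.")
freqs = nltk.FreqDist(word.lower() for word in tokens)
print(freqs.most_common(3))
# e.g. [('the', 3), ('mat', 2), ('.', 2)]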


Example 3: vectorize

def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        {
			[(instance_id, left_context, head, right_context, sense_id), ...]
        }
    :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...]
    :return: vectors: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: A dictionary with the following structure
            { instance_id : sense_id }

    '''

    vectors = {}
    labels = {}
    for (instance_id, left_context, head, right_context, sense_id) in data:
        labels[instance_id] = sense_id
        left_tokens = nltk.word_tokenize(left_context)
        right_tokens = nltk.word_tokenize(right_context)
        words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)
        vectors[instance_id] = frequency_vector_from_near_words(s, words)

    return vectors, labels
Developer: williamFalcon | Project: NLP_HW3 | Lines: 26 | Source: A.py


Example 4: colocation

def colocation(windowSize, pos, context,dictionary):
    if windowSize<=0:
        return dictionary
    #going forward
    forward= context[:(pos)]
    f= forward[(-windowSize/2):]
    #going backward    
    backward= context[pos+1:]
    b= backward[:windowSize/2]
    for item in f:
        key= "pre"+str(len(f)-f.index(item))+"-word"
        value= item
        dictionary[key]=value
        key= "pre"+str(len(f)-f.index(item))+"-pos"
        text = nltk.word_tokenize(item)
        value= nltk.pos_tag(text)[0][1]
        dictionary[key]=value
    for item in b:
        key= "fol"+str(b.index(item)+1)+"-word"
        value= item
        dictionary[key]=value
        key= "fol"+str(b.index(item)+1)+"-pos"
        text = nltk.word_tokenize(item)
        value= nltk.pos_tag(text)[0][1]
        dictionary[key]=value
    return dictionary
Developer: ansuabraham | Project: cs4740_3 | Lines: 26 | Source: colocation.py


Example 5: __tokenize

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Developer: ishalyminov | Project: dstc5 | Lines: 30 | Source: baseline_slu.py


Example 6: reading_level

def reading_level(full_text):
    #Clean the full_text
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char

    #Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)

    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(nltk.word_tokenize(full_text_clean))

    #Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)

    #Calculate the reading level
    #https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    grade_level = -15.59 + 0.39*(n_words/n_sents) + 11.8*(n_syll/n_words)
    return round(grade_level,1)
Developer: ECohen16 | Project: rapid_reader | Lines: 26 | Source: views.py
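
As a quick sanity check of the Flesch-Kincaid grade formula used in Example 6, here is a tiny worked computation with made-up counts (not taken from any real text):

# Hypothetical word, sentence and syllable counts:
n_words, n_sents, n_syll = 100.0, 5.0, 140.0
grade_level = -15.59 + 0.39 * (n_words / n_sents) + 11.8 * (n_syll / n_words)
print(round(grade_level, 1))  # 0.39*20 + 11.8*1.4 - 15.59 = 8.73, printed as 8.7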


Example 7: update

    def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                # import pdb;pdb.set_trace()
                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents)
Developer: Axighi | Project: Scripts | Lines: 27 | Source: PosTagFreqVectorizer.py


Example 8: main

def main(question, article):
  ddict = {}
  counts = get_counts()
  for tok in nltk.word_tokenize(article):
    ddict[tok] = ddict.get(tok, 0) + 1

  vec = []
  for tok in nltk.word_tokenize(question):

    # count in article
    tf = ddict.get(tok, 0) 

    # total articles is 108 / number that have current token
    idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
    vec.append(tf*idf)

  largest = max(vec)
  normalized = map(lambda y: y/largest, vec)

  finDic = {}
  for i, word in enumerate(nltk.word_tokenize(question)):
    finDic[word] = normalized[i]

  print finDic
  return finDic
Developer: NLP-Project | Project: NLP-project | Lines: 25 | Source: tdIDF.py


Example 9: next_note

def next_note(tokenizer):
    print 'SemEval data'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        with open(semeval_file, 'r') as f:
            st = []
            for line in f:
                st += [line.strip()]
            text = read_visit_sem(st)
            text = tokenizer.tokenize(text)
            for sent in text:
                yield nltk.word_tokenize(sent.lower())
    print 'MIMIC data'
    for notes_file in subset(notes_files, 15): # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass
Developer: ankitkv | Project: MIMICTools | Lines: 33 | Source: PhraseDetect.py


Example 10: PushDataPair

def PushDataPair(data, database):
        last = len(database['Q'].keys())
        for pair in data:
                database['Q'][last] = nltk.word_tokenize(pair['question'])
                database['A'][last] = nltk.word_tokenize(pair['answer'])
                last += 1
        return database
Developer: echoyuzhou | Project: ticktock_text_api | Lines: 7 | Source: Loader.py


Example 11: build_s

def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dict s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    '''
    s = {}

    # implement your code here
    for key,value in data.items():
      for i in value:
        tokens_left = nltk.word_tokenize(i[1])
        tokens_right = nltk.word_tokenize(i[3])
        left = [w for w in tokens_left if w not in string.punctuation][-window_size:]
        right = [w for w in tokens_right if w not in string.punctuation][:window_size]
        context = left + right
        if key not in s:
          s[key]=[]
        for word in context:
          if word not in s[key]:
            s[key].append(word)            
          
    return s
Developer: jubimishra | Project: Natural-Language-Processing | Lines: 32 | Source: A.py


Example 12: paragraph_features

def paragraph_features(paragraph_sents):
    global count
    count += 1
    print '\r', count,

    if FEATURE == FEAT_CONTAINS:
        paragraph_words = set(
            sents_to_words(paragraph_sents)
        )
    elif FEATURE == FEAT_LINKED_TITLES:
        paragraph_words = ' '.join(paragraph_sents)
    elif FEATURE == FEAT_FIRST_SENT:
        paragraph_words = nltk.word_tokenize(
            paragraph_sents[0]
        )
    elif FEATURE == FEAT_BEGIN_SENT:
        paragraph_words = {
            nltk.word_tokenize(sent)[0]
            for sent in paragraph_sents
        }
    else:
        paragraph_words = None
        print 'FEATURE NOT SUPPORTED'
        exit()

    features = dict()
    for word in word_features:
        features[word_features[word]] = (
            word in paragraph_words
        )

    return features
Developer: mikeholler | Project: thesis-undergrad | Lines: 32 | Source: classifier.py


Example 13: synsym

def synsym(s1,s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adj  
    jj0 = [x for x,y in ts0 if y=='JJ' or y=='JJR' or y=='JJS']
    jj1 = [x for x,y in ts1 if y=='JJ' or y=='JJR' or y=='JJS']
    if len(jj0) == 0 or len(jj1) ==0:
      jjps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      jjps = np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # noun
    jj0 = [x for x,y in ts0 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    jj1 = [x for x,y in ts1 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    if len(jj0) == 0 or len(jj1) ==0:
      nps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      nps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # verb
    jj0 = [x for x,y in ts0 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    jj1 = [x for x,y in ts1 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    if len(jj0) == 0 or len(jj1) ==0:
      vps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      vps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))    
    return [jjps,nps,vps]
Developer: gtesei | Project: fast-furious | Lines: 31 | Source: gensin_1.py


Example 14: build_s

def build_s(data):
    """
    Compute the context vector for each lexelt
    :param data: dic with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dic s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    """
    s = {}

    # implement your code here

    for lexelt in data:
        words = set()
        for instance in data[lexelt]:

            left_context = word_tokenize(instance[1].strip())
            for token in left_context[-window_size:]:
                if token not in puncts:
                    words.add(token)

            right_context = word_tokenize(instance[3].strip())
            for token in right_context[:window_size]:
                if token not in puncts:
                    words.add(token)
        s[lexelt] = list(words)

    return s
Developer: keyu-lai | Project: NLP | Lines: 35 | Source: A.py


Example 15: parseFile

def parseFile(file):
	""" Parse the header and source files for the class, and return the bindings dictionary, which contains tag data (and other pertinent 
		information about the file)
	"""
	#print file
	
	bindings 	= []
	
	
	# Load header file
	tokens 		= []
	if (file['header'] != ''):
		with open(file['header'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'header' )

	
	# Load source file
	tokens 		= []
	if (file['source'] != ''):
		with open(file['source'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'source' )	
	
	return bindings
Developer: jarrettchisholm | Project: pyliteserializer | Lines: 33 | Source: pyliteserializer.py


Example 16: nltk_filter

def nltk_filter(sent):
  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1            = b1.lower()
  tokens        = word_tokenize(b1)
  pos_tags      = pos_tag(tokens)
  filtered_sent = ' '
  for token in tokens:
    filtered_sent += '1'+token + ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

#note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2            = b2.lower()
  tokens        = word_tokenize(b2)
  pos_tags      = pos_tag(tokens)
  # filtered_sent = ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  for token in tokens:
    filtered_sent += '2' + token + ' '

  return filtered_sent
Developer: gthandavam | Project: Recipes | Lines: 30 | Source: builder.py


Example 17: read_liveqa

def read_liveqa(prefix = '../data/qalab-liveqa/dataset/qrels/', train = 'LiveQA2015-ver2.qrels', tokenize = True):
	import nltk

	f = open_file(prefix + train)
	np.random.seed(0)

	data_split = {0: [], 1 : [], 2 : []}
	ref_split = {0: [], 1 : [], 2 : []}

	for i,line in enumerate(f):
		l = line.strip().split('\t')
		if l[2] == '':
			first = " ? ".join(l[3].strip().split("?"))
			second = " . ".join(first.strip().split("."))
			q = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
			split_id = np.random.choice([0,0,0,1,2])
			continue
		label = int(l[2]) >= 3

		first = " ? ".join(l[3].strip().split("?"))
		second = " . ".join(first.strip().split("."))
		a = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
		data_split[split_id] += [(q,a,label,'','')]
		ref_split[split_id] += [(l[0],'0',l[0]+'_'+l[1]+'_'+str(i),str(int(label)))]

	return data_split[0],data_split[1],data_split[2],(ref_split[0],ref_split[1],ref_split[2])
Developer: wolet | Project: 11797-project | Lines: 26 | Source: prepare_data.py


Example 18: stanford_corenlp_filter

def stanford_corenlp_filter(sent):
  from nltk.tag.stanford import POSTagger
  posTagger = POSTagger('/Users/gt/Downloads/'
                        'stanford-postagger-2013-06-20/models/'
                        'wsj-0-18-bidirectional-nodistsim.tagger',
                        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                        '/stanford-postagger-3.2.0.jar',encoding=encoding)

  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1 = b1.lower()
  tokens = word_tokenize(b1)
  pos_tags = posTagger.tag(tokens)
  filtered_sent = ' '
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

      #note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2 = b2.lower()
  tokens = word_tokenize(b2)
  pos_tags = posTagger.tag(tokens)
  filtered_sent = ' '
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  return filtered_sent
Developer: gthandavam | Project: Recipes | Lines: 32 | Source: builder.py


Example 19: extract_pos_pair

def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1=""
    extent1=""
    trigger2=""
    extent2=""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1=one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2=one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1=one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2=one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1=one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2=one_pair[1]
            break
    return (pos1, pos2)
Developer: wtl-zju | Project: KBP2015 | Lines: 26 | Source: coref_feature_extraction.py


Example 20: checkTypeWordCount

def checkTypeWordCount(answer,question):
    count = 0
    status = ''
    sum = 0
    status1 = 'false'

    for word1 in word_tokenize(answer):
        if word1 in (".", ",", "'", '"', ":", ";", "?", "/", "\\", "|", "]", "[", "}", "{", "(",
                     ")", "*", "&", "^", "%", "$", "#", "@", "!", "`", "~", "-", "_", "=", "+"):
            print 'error'
        else:
            sum = sum +1
            #print word1
    print sum

    words_ans = word_tokenize(answer)
    words_qus = word_tokenize(question)
    if words_ans[0]=="NOTICE"or words_ans[0]=="Notice":
        print "Correct"
        count = count+0.25
    else:
        status = "Wrong"

    for word in words_qus:
        if en.is_number(word) and words_qus[words_qus.index(word)+1]== 'words':
            if sum >= int(word):
                print word
                count = count+0.25
            status1='true'

    if status1 == 'false':
        count = count+0.25
    return count,status
Developer: amilamadhushanka | Project: englishbuddy | Lines: 32 | Source: notice.py



Note: The nltk.word_tokenize examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their original authors, who retain the copyright; consult each project's license before redistributing or reusing the code. Do not republish without permission.

