Python twokenize.tokenize Function Code Examples


This article collects typical usage examples of the twokenize.tokenize function in Python. If you have been wondering how to call tokenize, how it is used in practice, or what real-world examples look like, the curated code samples below should help.



The following presents 20 code examples of the tokenize function, ordered roughly by popularity. A minimal usage sketch is given first, followed by the examples.
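The sketch below assumes only that a twokenize module is importable and exposes a tokenize(text) function returning a list of token strings, as in the TweetMotif-style Twitter tokenizers used by the projects in the examples; the sample tweet and the output shown in the comment are illustrative, not taken from any of those projects.

import twokenize

# A made-up tweet; mentions, URLs, emoticons and hashtags are the cases a
# Twitter-aware tokenizer is expected to keep intact as single tokens.
tweet = "@some_user that's soooo cool!!! :D see http://example.com #nlp"
tokens = twokenize.tokenize(tweet)
print(tokens)
# Roughly expected shape (illustrative):
# ['@some_user', "that's", 'soooo', 'cool', '!!!', ':D', 'see', 'http://example.com', '#nlp']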

Example 1: Extract

    def Extract(self, text):
        features = []
        words = twokenize.tokenize(text)

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for i in range(len(words)):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])

            if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                nCapitalized += 1.0

            if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                    nCapUpperViolated += 1.0
                    features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
                elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                    #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
                if re.match(r'\w+', words[i][0:1]):
                    nWords += 1
            if re.match(r"i|i'm|im|u", words[i]):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", words[i]):
                nAllCaps += 1
                
        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        return (' '.join(["%s:1" % x for x in features])
                + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps/nWords)
                + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized/nWords)
                + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated/nWords)
                + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated/nWords))
Developer: 52nlp, Project: twitter_nlp, Lines: 34, Source: cap_classifier.py


Example 2: kshinglize

def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """ Tokenizes string s, removes stopwords, and returns a set of k-shingles
    """
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
Developer: driscoll, Project: cluster, Lines: 7, Source: cluster.py


Example 3: main

def main(argv):

    tagger = PerceptronTagger()
    tagset = None
    # NOTE: `line` is not defined in this excerpt; it comes from the surrounding
    # script (Example 6 below shows the full file-reading loop from the same project).
    tokens = tokenize(line)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    format_tagged(tags)
Developer: h4x0rsz, Project: senior-design, Lines: 7, Source: tagAndLabel.py


Example 4: learn_terms

    def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
        reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
        term_freq = Counter()
        term_id_map = dict()
        tweet_vectors = []
        for row in reader:
            tweet_id = int(row[0])
            tweet_text = row[-1]
            terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
            if learn_lemmas:
                terms = [self.lmtz.lemmatize(term) for term in terms]
            tweet_sp_vector = []
            counted_ids = []
            for term in terms:
                if term not in term_id_map:
                    term_id = len(term_id_map)
                    term_id_map[term] = term_id
                else:
                    term_id = term_id_map[term]
                if term_id not in counted_ids:
                    term_freq[term_id] += 1
                    counted_ids.append(term_id)
                tweet_sp_vector.append(term_id)
            tweet_vectors.append((tweet_id, tweet_sp_vector))
            if len(tweet_vectors) >= cache_size:
                self.write_tweet_vectors(tweet_vectors)
                tweet_vectors = []
        self.write_tweet_vectors(tweet_vectors)
        self.write_terms(term_id_map, term_freq)
Developer: zaycev, Project: n7, Lines: 29, Source: search.py


Example 5: preprocess

def preprocess(m, sep_emoji=False):
    m = m.lower()    
    m = max_reps(m)
    #replace user mentions with token '@user'
    #(the mention regex literal was garbled in this copy; a plausible pattern is used here)
    user_regex = r"@\S+|<@mention>"
    m = re.sub(user_regex," @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url," url ", m, flags=re.I)        
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        #tokenize emoji; this tokenizer, however, has a problem where repeated
        #punctuation gets separated, e.g. "blah blah!!!" -> ['blah','blah','!','!','!']
        #instead of ['blah','blah','!!!']
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)         
        if len(n_toks)!=len(m_toks):
            #check if there is any punctuation in this string
            has_punct = map(lambda x:x in twk.punctuation, n_toks)
            if any(has_punct):  
                new_m = n_toks[0]
                for i in xrange(1,len(n_toks)):
                    #while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i-1] and (n_toks[i] == n_toks[i-1]):
                        new_m += n_toks[i]
                    else:
                        #otherwise add space
                        new_m += " "+n_toks[i]                   
                tokenized_msg = new_m                
    return tokenized_msg.lstrip()
Developer: samiroid, Project: utils, Lines: 27, Source: __init__.py


Example 6: main

def main(argv):

    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        exit()

    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    infile = open(infile_name, 'r')
    outfile = open(outfile_name, 'w')

    tagger = PerceptronTagger()

    print("Reading file...")
    line = infile.readline()

    while line != '':
        # Use Twokenizer for twitter parser
        tagset = None
        tokens = tokenize(line)
        tags = nltk.tag._pos_tag(tokens, tagset, tagger)
        outfile.write(format_tagged(tags))
        line = infile.readline()

    # close file and connection
    infile.close()
    outfile.close()
    print("Finished tagging... Closing files.")
Developer: h4x0rsz, Project: senior-design, Lines: 29, Source: getTaggedFile.py


Example 7: __init__

    def __init__(self, testData):
        self.labeledTweets = []
        for line in open(testData):
            line = line.rstrip('\n')
            fields = line.split('\t')
            fields[6] = ' '.join(twokenize.tokenize(fields[6]))
            self.labeledTweets.append(fields)
Developer: 52nlp, Project: twitter_nlp, Lines: 7, Source: cap_eval.py


Example 8: process

	def process(self,text):
		
		tTweet = ""
		for word in text.split():
			if "#" in word:
				word = word.replace("#"," ")
				f=0
				for tt in self.remove:
					if tt in word:
						f=1
				if f==1:
					continue
			tTweet = " ".join([tTweet,word])
			tTweet = tTweet.strip()

		tempTweet = ""
		for word in twokenize.tokenize(tTweet):
			if word != " " and word not in self.stop and not word.isdigit():
				word = word.strip().lower()
				if len(word) > 26:
					word=word[:27]
				#### Normalize Emoticons
				try:
					word = self.emoticons[word]
				except:
					#Normalize Acronyms
					try:
						try:
							if  self.wordDict[word] ==1:
								word = word
						except:
							word = self.acronyms[word]
					except:
					#Normalize Contractions
						try:
							word = self.contractions[word]
						except:
							#Normalize words (Spell)
							try:
								if self.wordDict[word] == 1:
									word =	word
							except:
								CW = self.correct(word)
								if "@" in word or "#" in word:
									word = word
								else:
									if CW != "a":
										word = CW
				if "@" in word:
					word="@user"
				tempTweet = " ".join([tempTweet,word.strip()])
				tempTweet = tempTweet.lower().strip()
		tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
		#print(tempTweet.encode("utf-8"))
		return(tempTweet)

##Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
Developer: suddu16, Project: Youtube-Comedy-Comparison, Lines: 59, Source: PreprocessClass.py


Example 9: process_line

def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 8, Source: createDictionaries.py


Example 10: all_tokens

def all_tokens(tweetreader):
    i = 0
    for r in tweetreader:
        i += 1
        tokens = tokenize(r[-1])
        for t in tokens:
            yield t
        if i >= 50000:
            return
Developer: zaycev, Project: n7, Lines: 9, Source: pmi.py


Example 11: process_line

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 10, Source: find_testfiles.py


Example 12: get_idx_from_sent

def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
Developer: BinbinBian, Project: ubottu, Lines: 12, Source: merge_data.py


Example 13: process_statuses

    def process_statuses(self, statuses):
        statuses = [twokenize.tokenize(s.text.lower()) for s in statuses]
        for s in xrange(len(statuses)):
            w = 1
            while True:
                if w >= len(statuses[s]):
                    break
                if statuses[s][w][0] == "'":
                    statuses[s] = statuses[s][:w-1] + [statuses[s][w-1] + statuses[s][w]] + statuses[s][w+1:]
                    w = 0
                w += 1
        return statuses
Developer: goddardc, Project: nlp-twitter, Lines: 12, Source: main.py


Example 14: tokenize_and_clean

def tokenize_and_clean(msg, alignments):
  if alignments: 
    toks = twokenize.tokenize(msg)
  else:          
    toks = twokenize.simple_tokenize(msg)
  for i in range(len(toks)):
    toks[i] = toks[i].lower()
  inds = range(len(toks))
  #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
  if alignments: 
    return toks.subset(inds)
  else:
    return [toks[i] for i in inds]
Developer: AnnuSachan, Project: tweetmotif, Lines: 13, Source: bigrams.py


Example 15: normalize_tweet

def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            tokens = map(lambda tk: re.sub(NUM_PATTERN, 'NUM', tk), tokens)
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean
Developer: imgemp, Project: semeval16, Lines: 13, Source: __init__.py


Example 16: preprocess

def preprocess(tweet):
    abbv_dict = json.load(open("../other/abbreviations.json"))
    emo_lexica_dict = json.load(open("../other/emotions.json"))
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
Developer: i-DAT, Project: emotionannotate, Lines: 13, Source: Preprocessor.py


Example 17: process_line

def process_line(s, clean_string=True, enable_tags = False):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
            s = clean_str(s)
    tokens = tokenize(s)
    if enable_tags:
        sent = nltk.pos_tag(tokens)
        chunks = nltk.ne_chunk(sent, binary=False)
        words = []
        for chunk in chunks:
            words += process_chunk(chunk)
        return [w.lower().encode('UTF-8') for w in words]
    else:
        return [process_token(token).lower().encode('UTF-8') for token in tokens]
Developer: pl8787, Project: UbuntuDataGenerator, Lines: 16, Source: TextPreprocess.py


Example 18: parse_tweets

def parse_tweets(tweets):
    parsed_tweets = []
    for tweet_json in tweets:
        try:
            #tweet_json = json.loads(tweet_str);
            tweet_text = tweet_json['text']
            if u'RT' in tweet_text:
                tweet_text = tweet_text[0:tweet_text.index(u'RT') - 1]

            tweet_token = tk.tokenize(tweet_text)
            tweet_token = [char_reduction(tok) for tok in tweet_token]
            tweet_token = [t for tok in tweet_token for t in es.expand(tok) if (not (('@' in t) or (tk.Url_RE.search(t)) or (not emo.Emoticon_RE.search(t) and tk.Punct_re.search(t))))]

            if tweet_token != []:
                tweet_obj = {"token": tweet_token, "location": tweet_json['place']['country'] if tweet_json['place'] != None else None, "json": tweet_json, "type": ""}
                parsed_tweets.append(tweet_obj)
        except Exception as e:
            print e

    return parsed_tweets
Developer: siddharthmodala, Project: twittersentiment, Lines: 20, Source: parse.py


Example 19: read_tweets

    def read_tweets(self, filename, emo):
        """Read tweets in raw format, returning a list of all tweets in the file"""
        emo_tweets = []
        non_emo_tweets = []
        with codecs.open(filename, encoding='utf8') as tweet_file:
#            tweet = []
            for line in tweet_file:
                data = json.loads(line)
                id = data['tweetid'].strip()
                text = data['text'].strip()
                emotions = data['emotions']
                tokens = tokenize(text)
                incount = 0
                for e in emotions:
                    if e == emo:
                        incount = 1
                if incount == 1:
                    emo_tweets.append(SPACE.join(tokens))
                elif incount == 0:
                    non_emo_tweets.append(SPACE.join(tokens))    
        return emo_tweets, non_emo_tweets
Developer: i-DAT, Project: emotionannotate, Lines: 21, Source: Preprocessor.py


Example 20: __init__

    def __init__(self, line):
        fields = line.split('","')
        if fields[0] == '"0':
            self.senti = -1
        elif fields[0] == '"2':
            self.senti = 0
        elif fields[0] == '"4':
            self.senti = 1
        self.id = fields[1]
        self.date = fields[2]
        # self.text = fields[5][1:-1]
        self.text = normalization(fields[5][:-1])
        tokens = tokenize(self.text)
        self.tokens = tokens
        tokens_postag = nltk.pos_tag(tokens)
        wordnet_tag = []
        for each_pair in tokens_postag:
            if 'NN' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'n'))
            if 'JJ' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'a'))
            elif 'RB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'r'))
            elif 'VB' in each_pair[1]:
                wordnet_tag.append((each_pair[0], 'v'))

        # lemmatized tokens are lemmatized and lowered
        self.ltoken_tag = []
        for each_pair in wordnet_tag:
            lword = lemmatizer.lemmatize(each_pair[0], each_pair[1])
            self.ltoken_tag.append((lword.lower(), each_pair[1]))

        self.tweet_senti_score = []

        for each_pair in self.ltoken_tag:
            each_score = sentiextractor.get_score(each_pair)
            if abs(each_score) > 0.02:
                self.tweet_senti_score.append(each_score)
            else:
                self.tweet_senti_score.append(0)
Developer: alwayforver, Project: demoBasic, Lines: 40, Source: tweetAnalysis.py



Note: The twokenize.tokenize examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The snippets are selected from open-source projects contributed by their original authors; copyright of the source code remains with those authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.

