
Python tokenize.regexp_tokenize function code examples


This article collects typical usage examples of the Python nltk.tokenize.regexp_tokenize function. If you have been wondering what exactly regexp_tokenize does, how to call it, or what real-world uses of it look like, the hand-picked code examples below should help.



A total of 20 code examples of the regexp_tokenize function are shown below, ordered by popularity by default.
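Before the collected examples, here is a minimal, self-contained usage sketch (the sample sentence and patterns are illustrative and not taken from any of the projects below). The pattern argument either describes the tokens themselves or, with gaps=True, the separators between them:

# Minimal regexp_tokenize sketch; assumes NLTK is installed.
# The sample text and patterns are illustrative only.
from nltk.tokenize import regexp_tokenize

text = "MS Office ver 3.20.2 costs $0.99 today"

# Default gaps=False: the pattern matches the tokens themselves.
word_tokens = regexp_tokenize(text, pattern=r"\w+")
# ['MS', 'Office', 'ver', '3', '20', '2', 'costs', '0', '99', 'today']

# gaps=True: the pattern matches the separators instead, so dotted
# version numbers and prices survive as single tokens.
space_split = regexp_tokenize(text, pattern=r"\s+", gaps=True)
# ['MS', 'Office', 'ver', '3.20.2', 'costs', '$0.99', 'today']

Most of the examples below use the first style, passing a token-matching pattern directly.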

Example 1: get_score

 def get_score(self, document, lang):
     # Extract ngrams
     unigrams = regexp_tokenize(document, pattern_unigrams)
     bigrams = regexp_tokenize(document, pattern_bigrams) 
     #Create frequency distributions    
     doc_fdist = FreqDist(unigrams + bigrams)
     sim = cosineOnDicts(self._prototypes[lang], doc_fdist, self._union)
     return sim
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 8 | Source: identify_language.py


Example 2: find_version

def find_version(text):
    digit_pattern = r"(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)"
    pattern = "\s?[vV]ersion\s?" + digit_pattern
    pattern += "| [vV]er\s?\.?\s?" + digit_pattern
    pattern += "| [vV]\s?\.?\s?" + digit_pattern
    version_matches = regexp_tokenize(text, pattern)
    pattern = digit_pattern + "$"
    versions = []
    for version in version_matches:
        matches = regexp_tokenize(version, pattern)
        for match in matches:
            versions.append(match)
    return versions
Author: taxomania | Project: TwitterMining | Lines: 13 | Source: text_utils.py


Example 3: are_duplicates

def are_duplicates(doc1, doc2):
    if len(doc1) > 50 and len(doc2) > 50 and not are_duplicates(doc1[:50], doc2[:50]):
        return False
    txt_tokens_1 = regexp_tokenize(doc1, pattern_words)
    txt_tokens_2 = regexp_tokenize(doc2, pattern_words)
    ngrams_1 = txt_tokens_1 + generate_ngrams(txt_tokens_1, 2)
    ngrams_2 = txt_tokens_2 + generate_ngrams(txt_tokens_2, 2)
    overlap = len([w for w in ngrams_1 if w in ngrams_2])
    score = (2*overlap)/(len(ngrams_1) + len(ngrams_2) + 1)  # Dice-style overlap score
    if score > 0.8: 
        return True
    else:
        return False    
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 13 | Source: check_duplicates.py


Example 4: __init__

    def __init__(self, sentence):
        self.sentence = sentence

        self.forms = []
        for s in tuple(open(FORMS, "r")):  # read the user_forms from file
            self.forms.append([w for w in regexp_tokenize(s, "[-\w]+") if w.isalnum()])

        if self.is_valid():
            self.tokens = regexp_tokenize(self.sentence, "(\\$)|[-\w]+")  # tokenizing with regex
            self.stop_words = set(stop.words("english"))  # filtering tokens words to remove
            self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
            self.spell_checked = self.spell_check()
            self.tags = pos_tag(self.spell_checked, tagset="universal")  # speech tagging (identification)
            print(self.tags)
            self.digits = self.get_digits()
            self.user_form = self.get_user_form()
Author: master-vic | Project: nltk-test | Lines: 16 | Source: mathieu.py


Example 5: word_split

def word_split(text):
    """
    Split a text into words. Returns a list of the word tokens.
    """
    a = regexp_tokenize(text.lower().strip(), pattern=r'\w+') 
    return a
Author: kuberkaul | Project: Information-Retrieval-System | Lines: 7 | Source: index.py


Example 6: main

def main():
    args = argument_parser.main()
    global sql
    sql = SQLConnector(host=args.host, port=args.port, user=args.user, passwd=args.password, db=args.db)
    global bing
    bing = BingSearch()
    global new_software
    new_software = NewSoftware()
    global possible_tags
    possible_tags = []
    mongo = MongoConnector(host=args.H, db=args.db)
    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print "No tweets left to analyse"
            break

        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()
                # text = "Version 2 Microsoft just released MS Office ver 3.20.2 for 99 cent 100c 10ps 13pence 10 pence"

                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()

                versions = find_version(text)

                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                # print words
                prices = find_price(words)

                pos_ = pos(words)
                ngram = ngrams(words, 5)

                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)
                    if tweet_id in possible_tags:
                        print tweet_id
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print tweet
                            print tagged_tweet
                            print
                            # mongo.insert(tagged_tweet)
                        else:
                            print tweet, "No software"
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError, e:
                    # This will allow the tweet to be tagged again at a later stage
                    print tweet_id + ":", e
                    print tweet
                    print
Author: taxomania | Project: TwitterMining | Lines: 60 | Source: tweet_tagging.py


Example 7: simhash

    def simhash(raw_text):
        """Compute the simhash value for a string."""
        fdist = FreqDist()
        for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
            fdist.inc(word.lower())

        v = [0] * 128

        for word in fdist:
            projection = bitarray()
            projection.fromstring(hashlib.md5(word).digest())
            #print "\tw:%s, %d" % (word, fdist[word])
            #print "\t\t 128 bit hash: " + str(b)

            for i in xrange(128):
                if projection[i]:
                    v[i] += fdist.get(word)
                else:
                    v[i] -= fdist.get(word)


        hash_val = bitarray(128)
        hash_val.setall(False)

        for i in xrange(128):
            if v[i] > 0:
                hash_val[i] = True
        return hash_val
Author: TPNguyen | Project: neardups | Lines: 28 | Source: test_deal.py


Example 8: identify_language

 def identify_language(self, document, default_lang = None):
     # Extract ngrams
     unigrams = regexp_tokenize(document, pattern_unigrams)
     bigrams = regexp_tokenize(document, pattern_bigrams) 
     
     #Create frequency distributions    
     doc_fdist = FreqDist(unigrams + bigrams)
     predicted_lang = default_lang
     max_sim = 0.5
     for k,v in self._prototypes.items():
         sim = cosineOnDicts(v, doc_fdist, self._union)
         if sim > max_sim:
             max_sim = sim
             predicted_lang = k
              
     return predicted_lang
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 16 | Source: identify_language.py


Example 9: getTokenizedQueries

def getTokenizedQueries():
	queriesFileName = "../cacm.query"

	f = open(queriesFileName, 'r')
	i = 0
	queriesList = {}
	isText = False
	for lineWithEnter in f:
		line = lineWithEnter[:-1]

		if len(line) == 0:
			continue
		elif line[0] == '<' or (line[0] == ' ' and len(line) == 1):
			isText = False
			continue
		else:
			if not isText:
				isText = True
				queriesList[i] = ""
				queriesList[i] += line
				i += 1
			else:
				queriesList[i - 1] += " "
				queriesList[i - 1] += line
			# print line

	tokenizedQueriesList = {}
	for q in queriesList:
		tokenizedQueriesList[q] = regexp_tokenize(queriesList[q], pattern='[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+')

	return tokenizedQueriesList
Author: xuweineo | Project: CS6200---Information-Retrieval---Final-Project | Lines: 31 | Source: parseQueries.py


Example 10: tokenizeList

def tokenizeList(tokenList):
    # remove stop words, punctuation & stem words to create tokens out of phrases and names
    tokenized_list = []

    for item in tokenList:
        tokenized = regexp_tokenize(item.lower(), "[\w']+")
        for word in tokenized:
            if word not in english_stops:
                stemmed = stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation)
                if not stemmed.isalpha():
                    if stemmed.isdigit():
                        stemmed = 'NUMBER'
                        tokenized_list.append(stemmed)
                    elif stemmed.isalnum():
                        stemmed = 'ALPHANUM'
                        tokenized_list.append(stemmed)
                else:
                    tokenized_list.append(stemmed)
        '''
        filtered = [word for word in tokenized if word not in english_stops]
        stemmed  = [stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation) for word in filtered]
        stemmed  = [word for word in stemmed if word != '']
        tokenized_list.extend(stemmed)
        '''

    return tokenized_list
Author: tgebru | Project: fb_movie_recs | Lines: 26 | Source: preprocess.py


Example 11: tag_and_tokenize

	def tag_and_tokenize(self,file):
		'''Tokenizes, chunks and tags the document text; the bulk of the script's work (time) happens here.'''
		self.text = get_docx_text(file)
		self.sentences = ""
		print("Tokenize and tagging...")
		self.sentences = regexp_tokenize(self.text, pattern='\w+|\$[\d\.]+|\S+')
		self.sentences = [st.tag(self.sentences)]
		print("Tagging done")
Author: DewarM | Project: cvParser | Lines: 8 | Source: my_parser.py


Example 12: words

def words(text, splitContractions=False, contractChars = ["'"]):
    '''Uses a regexp tokenizer to split text into words. If splitContractions is true,
    the regex pattern is [\w]+ so that contractions are split, e.g. "I can't" -> ['I','can','t'];
    otherwise the regex pattern is [\w']+ so that contractions are not split, i.e. "I can't" -> ['I', "can't"].
    Additional contraction characters, e.g. a hyphen, can be added by overriding the contractChars arg.'''
    if splitContractions:
        pat = "[\w]+"
    else:
        pat = "[\w{0}]+".format(reduce(lambda x,y: x+y, contractChars, ""))
    return regexp_tokenize(text, pat, discard_empty=True)
Author: danielforsyth | Project: arrc | Lines: 10 | Source: nlp.py


Example 13: index

def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)

            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item,request.POST.get("token_"+str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
                symbols = unique_list(word for sent in corpus for (word,tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)

            return render(request, 'tagger/index.html', {'corpus': corpus})

        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})

            else:
                address = request.POST.get("address")

            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,', )

            if tokens:
                pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()

                tagged = hmm.tag(tokens)

                tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()

                return render(request, 'tagger/index.html', {'address': address,
                                                              'tokens': tokens,
                                                              'tagged': tagged,
                                                              'tags': sorted(tags.items(), key=operator.itemgetter(1)) })

    return render(request, 'tagger/index.html', {})
Author: bolvano | Project: hmm-ap | Lines: 54 | Source: views.py


Example 14: getReviews

def getReviews(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder,filename),'r') as src:
                review = src.read()
                words = regexp_tokenize(review,"\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    return reviews
Author: masterpurav | Project: DeceptionDetection | Lines: 12 | Source: reader.py


Example 15: _tokenize_content

 def _tokenize_content(self):
     tokenized_content = []
     raw_content = self._clean_content()
     content_sents = sent_tokenize(raw_content)
     content_words_by_sents = map(lambda sent: word_tokenize(sent), content_sents)
     stopwords = regexp_tokenize(STOPWORDS, "[\w']+")
     extra_puncts = ['),', ').', '%),', '%).', '):', '()', '://', '>.', '.;', '...', '/>.']
     puncts = list(punctuation) + extra_puncts
     stopwords.extend(puncts)
     for sent in content_words_by_sents:
         clean_sent = [word for word in sent if word not in stopwords]
         tokenized_content.append(clean_sent)
     return tokenized_content
Author: hudsonsferreira | Project: YakinduParser | Lines: 13 | Source: yakindu_parser.py


Example 16: get_features

def get_features(review,polarity):
    features = {}
    uniqueWords = 0
    personalRatio = 0
    personal = 0
    misspelt = 0
    hotelName = 0
    personalPronouns = ["i","me","we","our","ours","mine"]
    sentences = sent_tokenize(review)
    sent = nltk.word_tokenize(review)

    s = len(sentences)
    wordsR = regexp_tokenize(review,"\w+")
    for x in wordsR:
        if x in personalPronouns:
            personal+=1
        #if x not in set(words.words()):
            #misspelt+=1
        if x in hotels:
            hotelName+=1
    w = len(wordsR)
    unique = len(set(wordsR))
    uniqueWords+=unique
    review = review.replace(" ","")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for i in range(len(review)):
        if review[i].isupper():
            cap+=1
        if review[i] == '$':
            features['dollar'] = True
    ari =4.71*(float(c)/w)+0.5*(float(w)/s)-21.43
    capRatio = c/float(s)
    personalRatio += float(personal)/w
    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    ngrams = get_bigrams(review,'x')
    sentiments = get_sentimentFeatures(review,'x')
    for x in ngrams.keys():
        features[x] = ngrams[x]
    for x in sentiments.keys():
        features[x] = sentiments[x]
    features['misspelt'] = misspelt
    return features
Author: masterpurav | Project: DeceptionDetection | Lines: 49 | Source: reader.py


Example 17: tokenize_text

def tokenize_text(page_text):
    """
    Tokenizes text using NLTK and regEx   
    """

    pattern = r"""(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]"""

    tokens = regexp_tokenize(page_text.strip().lower(), pattern)
    tokens = [cleanup(w) for w in tokens]

    tokens = [w for w in tokens if ((len(w) > 1) and (money(w) or alpha_num(w)))]

    tokens = [LMTZR.lemmatize(w) for w in tokens]

    return tokens
Author: PhaniJella | Project: grisham | Lines: 15 | Source: create_paper_corpus.py


Example 18: tokenize_text

def tokenize_text(page_text):
    '''
    Tokenizes text using NLTK and regEx   
    '''

    pattern = r'''(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]'''
    remove_list = ["[", "]", "{", "}", "(", ")", 
              "'", ".", "..", "...", ",", "?", "!", 
              "/", "\"", "\"", ";", ":", "-", "�", "_", "�", "�", 
              "`", "~", "@", "$", "^", "|", "#", "=", "*", "?"];
    ## making it to lower case may affect the performance
    tokens = regexp_tokenize(page_text, pattern)

    ## Removes unnecessary words 
    wt = [w for w in tokens if ((w not in remove_list) and (len(w) > 1))];        

    return wt;
Author: clintpgeorge | Project: hornbill | Lines: 17 | Source: build_word_stream.py


Example 19: countW

def countW(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder,filename),'r') as src:
                review = src.read()
                words = regexp_tokenize(review,"\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    unique = set(unique)
    uniqueR = []
    for w in unique:
        if w not in stopwords.words('english'):
            uniqueR.append(w)
    print (len(set(uniqueR)))
Author: masterpurav | Project: DeceptionDetection | Lines: 17 | Source: reader.py


Example 20: calculateAGARI

def calculateAGARI(rootdir):
    avgARI = 0
    count = 0
    uniqueWords = 0
    personalRatio = 0
    dollarCount = 0
    personalPronouns = ["i","me","we","our","ours","mine"]
    hotelName = 0
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                personal = 0
                sentences = sent_tokenize(review)
                s = len(sentences)
                capitals = 0
                words = regexp_tokenize(review,"\w+")
                for x in words:
                    if x in personalPronouns:
                        personal+=1
                    if x in hotels:
                        hotelName+=1
                w = len(words)
                unique = len(set(words))
                uniqueWords+=unique
                review = review.replace(" ","")
                flag = "f"
                for i in range(len(review)):
                    if review[i].isupper():
                        capitals+=1
                    if review[i] == '$':
                        flag = "t"
                if flag=="t":
                    dollarCount+=1
                c = len(review)
                ari =4.71*(float(c)/w)+0.5*(float(w)/s)-21.43
                avgARI += ari
                count += 1
                personalRatio += float(personal)/w
                #print(nltk.ne_chunk(review))
    print("\n"+rootdir)
    print("ARI : "+str(float(avgARI/count)))
    print("Unique words"+" "+str(uniqueWords/float(count)))
    print("Ratio personal : "+str(personalRatio/float(count)))
    print("DollarCount :"+str(dollarCount))
Author: masterpurav | Project: DeceptionDetection | Lines: 45 | Source: reader.py



Note: The nltk.tokenize.regexp_tokenize examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.

