This article collects typical usage examples of the Python function nltk.tokenize.regexp_tokenize. If you are wondering what regexp_tokenize does, how to call it, or simply want to see real-world uses of it, the curated examples below should help.
Below, 20 code examples of regexp_tokenize are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
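Before the examples, here is a minimal, self-contained sketch of the basic call (assuming NLTK is installed): regexp_tokenize returns every non-overlapping match of the pattern in the text.

# Minimal usage sketch: tokenize by matching runs of word characters.
from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("NLTK 3 makes tokenizing easy.", pattern=r"\w+"))
# ['NLTK', '3', 'makes', 'tokenizing', 'easy']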
Example 1: get_score
def get_score(self, document, lang):
    # Extract unigram and bigram features
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create a frequency distribution over both
    doc_fdist = FreqDist(unigrams + bigrams)
    sim = cosineOnDicts(self._prototypes[lang], doc_fdist, self._union)
    return sim
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 8 | Source file: identify_language.py
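Here pattern_unigrams, pattern_bigrams, cosineOnDicts and self._prototypes are defined elsewhere in the project. A rough, runnable sketch with hypothetical character-level patterns shows how the frequency distribution is built; the stand-in patterns are assumptions, not the project's real ones.

# Hypothetical stand-ins for pattern_unigrams / pattern_bigrams.
from nltk import FreqDist
from nltk.tokenize import regexp_tokenize

pattern_unigrams = r"\w"    # assumption: single word characters
pattern_bigrams = r"\w\w"   # assumption: non-overlapping character pairs

document = "ein kleines Beispiel"
doc_fdist = FreqDist(regexp_tokenize(document, pattern_unigrams)
                     + regexp_tokenize(document, pattern_bigrams))
print(doc_fdist.most_common(3))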
Example 2: find_version
def find_version(text):
    digit_pattern = r"(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)"
    pattern = "\s?[vV]ersion\s?" + digit_pattern
    pattern += "| [vV]er\s?\.?\s?" + digit_pattern
    pattern += "| [vV]\s?\.?\s?" + digit_pattern
    version_matches = regexp_tokenize(text, pattern)
    pattern = digit_pattern + "$"
    versions = []
    for version in version_matches:
        matches = regexp_tokenize(version, pattern)
        for match in matches:
            versions.append(match)
    return versions
Author: taxomania | Project: TwitterMining | Lines: 13 | Source file: text_utils.py
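Note that the digit pattern above uses capturing groups; in recent NLTK releases regexp_tokenize is findall-based, so capturing groups can change what it returns. A hedged sketch with non-capturing groups shows the intended kind of match (output may differ with the NLTK version installed).

# Same idea with non-capturing groups, so regexp_tokenize returns whole matches.
from nltk.tokenize import regexp_tokenize

digit_pattern = r"(?:\d+\.)?(?:\d+\.)?(?:\*|\d+)"
pattern = r"\s?[vV]ersion\s?" + digit_pattern + r"| [vV]er\s?\.?\s?" + digit_pattern
print(regexp_tokenize("Microsoft just released MS Office Version 3.20.2", pattern))
# e.g. [' Version 3.20.2'] -- the leading space comes from the \s? in the pattern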
Example 3: are_duplicates
def are_duplicates(doc1, doc2):
    if len(doc1) > 50 and len(doc2) > 50 and not are_duplicates(doc1[:50], doc2[:50]):
        return False
    txt_tokens_1 = regexp_tokenize(doc1, pattern_words)
    txt_tokens_2 = regexp_tokenize(doc2, pattern_words)
    ngrams_1 = txt_tokens_1 + generate_ngrams(txt_tokens_1, 2)
    ngrams_2 = txt_tokens_2 + generate_ngrams(txt_tokens_2, 2)
    overlap = len([w for w in ngrams_1 if w in ngrams_2])
    # Dice-style overlap score over unigrams and bigrams
    score = (2 * overlap) / (len(ngrams_1) + len(ngrams_2) + 1)
    if score > 0.8:
        return True
    else:
        return False
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 13 | Source file: check_duplicates.py
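generate_ngrams and pattern_words come from elsewhere in the same project; the hypothetical stand-ins below (labelled as such) only make the bigram step concrete and may differ from the real helpers.

# Hypothetical helpers, just so the snippet runs on its own.
from nltk.tokenize import regexp_tokenize

pattern_words = r"\w+"  # assumption

def generate_ngrams(tokens, n):
    # Join every window of n consecutive tokens into one string
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = regexp_tokenize("the quick brown fox", pattern_words)
print(tokens + generate_ngrams(tokens, 2))
# ['the', 'quick', 'brown', 'fox', 'the quick', 'quick brown', 'brown fox']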
Example 4: __init__
def __init__(self, sentence):
    self.sentence = sentence
    self.forms = []
    for s in tuple(open(FORMS, "r")):  # read the user forms from file
        self.forms.append([w for w in regexp_tokenize(s, "[-\w]+") if w.isalnum()])
    if self.is_valid():
        self.tokens = regexp_tokenize(self.sentence, "(\\$)|[-\w]+")  # tokenize with a regex
        self.stop_words = set(stop.words("english"))  # stop words to filter out
        self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
        self.spell_checked = self.spell_check()
        self.tags = pos_tag(self.spell_checked, tagset="universal")  # part-of-speech tagging
        print(self.tags)
        self.digits = self.get_digits()
        self.user_form = self.get_user_form()
Author: master-vic | Project: nltk-test | Lines: 16 | Source file: mathieu.py
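A quick check of the "[-\w]+" pattern used when reading the forms file: hyphenated words stay intact as single tokens.

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("state-of-the-art math form", r"[-\w]+"))
# ['state-of-the-art', 'math', 'form']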
Example 5: word_split
def word_split(text):
    """
    Split a text into words. Returns a list of lowercased word tokens.
    """
    a = regexp_tokenize(text.lower().strip(), pattern=r'\w+')
    return a
Author: kuberkaul | Project: Information-Retrieval-System | Lines: 7 | Source file: index.py
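A quick call confirms the behaviour: lowercased word tokens, with punctuation dropped.

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("Hello, World! 42".lower().strip(), pattern=r'\w+'))
# ['hello', 'world', '42']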
Example 6: main
def main():
    args = argument_parser.main()
    global sql
    sql = SQLConnector(host=args.host, port=args.port, user=args.user, passwd=args.password, db=args.db)
    global bing
    bing = BingSearch()
    global new_software
    new_software = NewSoftware()
    global possible_tags
    possible_tags = []
    mongo = MongoConnector(host=args.H, db=args.db)
    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print "No tweets left to analyse"
            break
        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()
                # text = "Version 2 Microsoft just released MS Office ver 3.20.2 for 99 cent 100c 10ps 13pence 10 pence"
                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()
                versions = find_version(text)
                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                # print words
                prices = find_price(words)
                pos_ = pos(words)
                ngram = ngrams(words, 5)
                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)
                    if tweet_id in possible_tags:
                        print tweet_id
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print tweet
                            print tagged_tweet
                            print
                            # mongo.insert(tagged_tweet)
                        else:
                            print tweet, "No software"
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError, e:
                    # This allows the tweet to be tagged again at a later stage
                    print tweet_id + ":", e
                    print tweet
                    print
Author: taxomania | Project: TwitterMining | Lines: 60 | Source file: tweet_tagging.py
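The snippet above is Python 2 (print statements, except-comma syntax), and its tweet pattern contains a capturing group. With findall-based regexp_tokenize in NLTK 3, a non-capturing variant returns the whole tokens; a small sketch under that assumption:

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("ms office ver 3.20.2 for 99 cents!", pattern=r"\w+(?:[.,]\w+)*|\S+"))
# ['ms', 'office', 'ver', '3.20.2', 'for', '99', 'cents', '!']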
Example 7: simhash
def simhash(raw_text):
    """Compute the simhash value for a string."""
    fdist = FreqDist()
    for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
        fdist.inc(word.lower())
    v = [0] * 128
    for word in fdist:
        projection = bitarray()
        projection.fromstring(hashlib.md5(word).digest())
        # print "\tw:%s, %d" % (word, fdist[word])
        # print "\t\t 128 bit hash: " + str(b)
        for i in xrange(128):
            if projection[i]:
                v[i] += fdist.get(word)
            else:
                v[i] -= fdist.get(word)
    hash_val = bitarray(128)
    hash_val.setall(False)
    for i in xrange(128):
        if v[i] > 0:
            hash_val[i] = True
    return hash_val
Author: TPNguyen | Project: neardups | Lines: 28 | Source file: test_deal.py
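FreqDist.inc() belongs to older NLTK releases; in NLTK 3, FreqDist behaves like a Counter, so the counting step would look roughly like the sketch below (pattern rewritten without a capturing group; this is an adaptation, not the original project's code).

from nltk import FreqDist
from nltk.tokenize import regexp_tokenize

raw_text = "Near-duplicate pages, near duplicate pages."
fdist = FreqDist(w.lower() for w in regexp_tokenize(raw_text, pattern=r"\w+(?:[.,]\w+)*|\S+"))
print(fdist.most_common(3))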
Example 8: identify_language
def identify_language(self, document, default_lang=None):
    # Extract unigram and bigram features
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create a frequency distribution over both
    doc_fdist = FreqDist(unigrams + bigrams)
    predicted_lang = default_lang
    max_sim = 0.5
    for k, v in self._prototypes.items():
        sim = cosineOnDicts(v, doc_fdist, self._union)
        if sim > max_sim:
            max_sim = sim
            predicted_lang = k
    return predicted_lang
Author: daniilsorokin | Project: Web-Mining-Exercises | Lines: 16 | Source file: identify_language.py
Example 9: getTokenizedQueries
def getTokenizedQueries():
    queriesFileName = "../cacm.query"
    f = open(queriesFileName, 'r')
    i = 0
    queriesList = {}
    isText = False
    for lineWithEnter in f:
        line = lineWithEnter[:-1]
        if len(line) == 0:
            continue
        elif line[0] == '<' or (line[0] == ' ' and len(line) == 1):
            isText = False
            continue
        else:
            if not isText:
                isText = True
                queriesList[i] = ""
                queriesList[i] += line
                i += 1
            else:
                queriesList[i - 1] += " "
                queriesList[i - 1] += line
                # print line
    tokenizedQueriesList = {}
    for q in queriesList:
        tokenizedQueriesList[q] = regexp_tokenize(queriesList[q], pattern='[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+')
    return tokenizedQueriesList
Author: xuweineo | Project: CS6200---Information-Retrieval---Final-Project | Lines: 31 | Source file: parseQueries.py
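The query pattern keeps multi-digit numbers (with separators and an optional percent sign), bracketed citation markers, and hyphenated words as single tokens; a small illustration:

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("ranked retrieval [12] improved 10,5% on CACM",
                      pattern=r'[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+'))
# ['ranked', 'retrieval', '[12]', 'improved', '10,5%', 'on', 'CACM']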
Example 10: tokenizeList
def tokenizeList(tokenList):
    # Remove stop words and punctuation, and stem words to create tokens out of phrases and names
    tokenized_list = []
    for item in tokenList:
        tokenized = regexp_tokenize(item.lower(), "[\w']+")
        for word in tokenized:
            if word not in english_stops:
                stemmed = stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation)
                if not stemmed.isalpha():
                    if stemmed.isdigit():
                        stemmed = 'NUMBER'
                        tokenized_list.append(stemmed)
                    elif stemmed.isalnum():
                        stemmed = 'ALPHANUM'
                        tokenized_list.append(stemmed)
                else:
                    tokenized_list.append(stemmed)
    '''
    filtered = [word for word in tokenized if word not in english_stops]
    stemmed = [stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation) for word in filtered]
    stemmed = [word for word in stemmed if word != '']
    tokenized_list.extend(stemmed)
    '''
    return tokenized_list
Author: tgebru | Project: fb_movie_recs | Lines: 26 | Source file: preprocess.py
Example 11: tag_and_tokenize
def tag_and_tokenize(self, file):
    '''Tokenize, chunk and tag the text of the file; the bulk of the script's work (time) is done here.'''
    self.text = get_docx_text(file)
    self.sentences = ""
    print("Tokenizing and tagging...")
    self.sentences = regexp_tokenize(self.text, pattern='\w+|\$[\d\.]+|\S+')
    self.sentences = [st.tag(self.sentences)]
    print("Tagging done")
Author: DewarM | Project: cvParser | Lines: 8 | Source file: my_parser.py
Example 12: words
def words(text, splitContractions=False, contractChars=["'"]):
    '''Uses a RegexpTokenizer to tokenize text into words. If splitContractions is True,
    the regex pattern is [\w]+ so that contractions are split, e.g. "I can't" -> ['I', 'can', 't'];
    otherwise the pattern is [\w']+ so that contractions are kept whole, i.e. "I can't" -> ['I', "can't"].
    Additional contraction characters, e.g. a hyphen, can be added by overriding the contractChars argument.'''
    if splitContractions:
        pat = "[\w]+"
    else:
        pat = "[\w{0}]+".format(reduce(lambda x, y: x + y, contractChars, ""))
    return regexp_tokenize(text, pat, discard_empty=True)
Author: danielforsyth | Project: arrc | Lines: 10 | Source file: nlp.py
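The two patterns the helper builds can also be checked directly:

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("I can't", r"[\w]+"))   # ['I', 'can', 't']
print(regexp_tokenize("I can't", r"[\w']+"))  # ['I', "can't"]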
Example 13: index
def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR + "/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item, request.POST.get("token_" + str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR + "/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
                symbols = unique_list(word for sent in corpus for (word, tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR + "/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})
            else:
                address = request.POST.get("address")
            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
            if tokens:
                pkl_file = open(settings.BASE_DIR + "/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()
                tagged = hmm.tag(tokens)
                tags_file = open(settings.BASE_DIR + "/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()
                return render(request, 'tagger/index.html', {'address': address,
                                                             'tokens': tokens,
                                                             'tagged': tagged,
                                                             'tags': sorted(tags.items(), key=operator.itemgetter(1))})
    return render(request, 'tagger/index.html', {})
Author: bolvano | Project: hmm-ap | Lines: 54 | Source file: views.py
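The address pattern splits input into digit runs, commas, and runs of other non-space characters, which suits HMM-based address tagging:

from nltk.tokenize import regexp_tokenize

print(regexp_tokenize("12 Main Street, Apt 4", pattern=r'\d+|[^\r\n\t\f 0-9,]+|,'))
# ['12', 'Main', 'Street', ',', 'Apt', '4']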
Example 14: getReviews
def getReviews(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                words = regexp_tokenize(review, "\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    return reviews
Author: masterpurav | Project: DeceptionDetection | Lines: 12 | Source file: reader.py
Example 15: _tokenize_content
def _tokenize_content(self):
    tokenized_content = []
    raw_content = self._clean_content()
    content_sents = sent_tokenize(raw_content)
    content_words_by_sents = map(lambda sent: word_tokenize(sent), content_sents)
    stopwords = regexp_tokenize(STOPWORDS, "[\w']+")
    extra_puncts = ['),', ').', '%),', '%).', '):', '()', '://', '>.', '.;', '...', '/>.']
    puncts = list(punctuation) + extra_puncts
    stopwords.extend(puncts)
    for sent in content_words_by_sents:
        clean_sent = [word for word in sent if word not in stopwords]
        tokenized_content.append(clean_sent)
    return tokenized_content
Author: hudsonsferreira | Project: YakinduParser | Lines: 13 | Source file: yakindu_parser.py
Example 16: get_features
def get_features(review, polarity):
    features = {}
    uniqueWords = 0
    personalRatio = 0
    personal = 0
    misspelt = 0
    hotelName = 0
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    sentences = sent_tokenize(review)
    sent = nltk.word_tokenize(review)
    s = len(sentences)
    wordsR = regexp_tokenize(review, "\w+")
    for x in wordsR:
        if x in personalPronouns:
            personal += 1
        # if x not in set(words.words()):
        #     misspelt += 1
        if x in hotels:
            hotelName += 1
    w = len(wordsR)
    unique = len(set(wordsR))
    uniqueWords += unique
    review = review.replace(" ", "")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for i in range(len(review)):
        if review[i].isupper():
            cap += 1
        if review[i] == '$':
            features['dollar'] = True
    ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
    capRatio = c / float(s)
    personalRatio += float(personal) / w
    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    ngrams = get_bigrams(review, 'x')
    sentiments = get_sentimentFeatures(review, 'x')
    for x in ngrams.keys():
        features[x] = ngrams[x]
    for x in sentiments.keys():
        features[x] = sentiments[x]
    features['misspelt'] = misspelt
    return features
Author: masterpurav | Project: DeceptionDetection | Lines: 49 | Source file: reader.py
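The readability score computed above is the Automated Readability Index, ARI = 4.71*(characters/words) + 0.5*(words/sentences) - 21.43. A tiny worked example with toy numbers (not taken from a real review):

c, w, s = 100, 25, 2   # characters, words, sentences
ari = 4.71 * (c / w) + 0.5 * (w / s) - 21.43
print(round(ari, 2))   # 3.66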
Example 17: tokenize_text
def tokenize_text(page_text):
    """
    Tokenizes text using NLTK and a regular expression.
    """
    pattern = r"""(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]"""
    tokens = regexp_tokenize(page_text.strip().lower(), pattern)
    tokens = [cleanup(w) for w in tokens]
    tokens = [w for w in tokens if ((len(w) > 1) and (money(w) or alpha_num(w)))]
    tokens = [LMTZR.lemmatize(w) for w in tokens]
    return tokens
Author: PhaniJella | Project: grisham | Lines: 15 | Source file: create_paper_corpus.py
Example 18: tokenize_text
def tokenize_text(page_text):
    '''
    Tokenizes text using NLTK and a regular expression.
    '''
    pattern = r'''(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]'''
    remove_list = ["[", "]", "{", "}", "(", ")",
                   "'", ".", "..", "...", ",", "?", "!",
                   "/", "\"", "\"", ";", ":", "-", "�", "_", "�", "�",
                   "`", "~", "@", "$", "^", "|", "#", "=", "*", "?"]
    # Making the text lower case may affect performance
    tokens = regexp_tokenize(page_text, pattern)
    # Remove unwanted tokens
    wt = [w for w in tokens if ((w not in remove_list) and (len(w) > 1))]
    return wt
Author: clintpgeorge | Project: hornbill | Lines: 17 | Source file: build_word_stream.py
Example 19: countW
def countW(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                words = regexp_tokenize(review, "\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    unique = set(unique)
    uniqueR = []
    for w in unique:
        if w not in stopwords.words('english'):
            uniqueR.append(w)
    print(len(set(uniqueR)))
Author: masterpurav | Project: DeceptionDetection | Lines: 17 | Source file: reader.py
Example 20: calculateAGARI
def calculateAGARI(rootdir):
    avgARI = 0
    count = 0
    uniqueWords = 0
    personalRatio = 0
    dollarCount = 0
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    hotelName = 0
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                personal = 0
                sentences = sent_tokenize(review)
                s = len(sentences)
                capitals = 0
                words = regexp_tokenize(review, "\w+")
                for x in words:
                    if x in personalPronouns:
                        personal += 1
                    if x in hotels:
                        hotelName += 1
                w = len(words)
                unique = len(set(words))
                uniqueWords += unique
                review = review.replace(" ", "")
                flag = "f"
                for i in range(len(review)):
                    if review[i].isupper():
                        capitals += 1
                    if review[i] == '$':
                        flag = "t"
                if flag == "t":
                    dollarCount += 1
                c = len(review)
                ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
                avgARI += ari
                count += 1
                personalRatio += float(personal) / w
                # print(nltk.ne_chunk(review))
    print("\n" + rootdir)
    print("ARI : " + str(float(avgARI / count)))
    print("Unique words " + str(uniqueWords / float(count)))
    print("Ratio personal : " + str(personalRatio / float(count)))
    print("DollarCount :" + str(dollarCount))
Author: masterpurav | Project: DeceptionDetection | Lines: 45 | Source file: reader.py
Note: the nltk.tokenize.regexp_tokenize examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by their original authors; copyright remains with those authors, and reuse or redistribution should follow the license of the corresponding project. Do not reproduce without permission.