This article collects typical usage examples of the nltk.tokenize.RegexpTokenizer class in Python. If you have been wondering what the RegexpTokenizer class is for, how to use it, or what real-world code built on it looks like, the curated class examples below may help.
The following 20 code examples of the RegexpTokenizer class are presented, sorted by popularity by default.
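Before turning to the project code, here is a minimal sketch of the class's basic behavior; the pattern and the sample sentence are purely illustrative and are not taken from any of the projects below:

from nltk.tokenize import RegexpTokenizer

# r'\w+' matches runs of word characters, so punctuation is simply dropped.
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's', '2024']

The constructor compiles the regular expression once, so a single tokenizer instance can be reused across many documents.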
Example 1: preprocess
def preprocess(TWEETS, typeTweet):
    wordlist = []
    tokenizer = RegexpTokenizer(r'#?\w+')
    # normalize text -- tokenize using the regex tokenizer
    cnt = 0
    for item in TWEETS:
        text = TWEETS[cnt]
        tweet = ''.join(text)
        tweet = tweet.lower().strip('\n')
        tweet = re.sub(r'[0-9]+', "", tweet)
        tweet = re.sub(r'@[^\s]+', "", tweet)
        tweet = re.sub(r'#\w+primary', "", tweet)
        wordlist.extend(tokenizer.tokenize(tweet))
        cnt += 1
    # remove stopwords
    stop = stopwords.words('english') + ['rt', 'via', 'u', 'r', 'b', '2', 'http',
                                         'https', 'co', 'live', 'hall', 'town', 'watch',
                                         'tune', 'time', 'tonight', 'today', 'campaign',
                                         'debate', 'wants', 'without', 'dont',
                                         '#hillaryclinton', '#berniesanders', '#donaldtrump',
                                         '#tedcruz', '#johnkasich', '#politics']
    filtered = [term for term in wordlist if term not in stop]
    filtered_final = [term for term in filtered if len(term) > 3]
    print 'Preprocessed %s tweets' % (typeTweet)
    return filtered_final
Developer: martinezmonica123 | Project: Twitter-Sentiment-Analysis | Lines: 27 | Source: text_analysis_tweets.py
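A quick way to see why Example 1 uses #?\w+ rather than a plain \w+ pattern: the optional leading '#' keeps hashtags together as single tokens, while @mentions (already stripped earlier by re.sub) would lose their '@' anyway. The snippet below is an illustrative sketch, not code from the project above:

from nltk.tokenize import RegexpTokenizer

# The optional '#' prefix preserves hashtags as single tokens.
tokenizer = RegexpTokenizer(r'#?\w+')
print(tokenizer.tokenize("voting for @someone in the #primary today"))
# ['voting', 'for', 'someone', 'in', 'the', '#primary', 'today']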
Example 2: lda
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]
    return
    # NOTE: everything below this early return is unreachable as written
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)
    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
Developer: ChilupuriAnilReddy | Project: SMAI_Major_Project | Lines: 33 | Source: Analysing_Data.py
Example 3: textToWordList
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом',
        'город',
        'дорог',
        'час',
        'ноч',
        'слов',
        'утр',
        'стран',
        'пут',
        'путешеств',
        'мест',
        'нов',
        'друз',
        'добр'
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
Developer: Omrigan | Project: travelrec | Lines: 25 | Source: views.py
Example 4: Tokenize
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens
    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Developer: prathmeshgat | Project: SuicidalPersonDetection | Lines: 27 | Source: AudioToText.py
Example 5: tokenize
def tokenize(self, doc):
    '''
    use NLTK RegexpTokenizer
    '''
    tokenizer = RegexpTokenizer(r"\w{3,}")
    return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
Developer: Hongtian22 | Project: Movier | Lines: 7 | Source: movier.py
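Example 5's r"\w{3,}" pattern builds a minimum-length filter into the tokenizer itself, so tokens shorter than three characters never reach the stemmer. A toy illustration with an invented sentence:

from nltk.tokenize import RegexpTokenizer

# Requiring at least three word characters drops short, low-information tokens.
tokenizer = RegexpTokenizer(r"\w{3,}")
print(tokenizer.tokenize("it is a very good movie"))
# ['very', 'good', 'movie']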
Example 6: text_process
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # remove any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    # stem
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    try:
        text_processed.remove('b')
    except:
        pass
    return " ".join(text_processed)
Developer: shankarchari | Project: data_science | Lines: 30 | Source: process_data.py
Example 7: trainMarkovChain
def trainMarkovChain(self, n=1):
    self.ngram_degree = n
    self.markov_model = defaultdict(lambda: defaultdict(int))
    sentences = self.corpus_sentences
    if sentences is None:
        sentences = self.sentenceTokenizeCorpus()
    print("Training markov model on corpus.")
    word_tokenizer = RegexpTokenizer(r"\w+")
    for sentence in sentences:
        words = word_tokenizer.tokenize(sentence)
        last_word_list = ["#"] * n
        for word in words:
            last_token = " ".join(last_word_list)
            self.markov_model[last_token][word] += 1
            last_word_list.append(word)
            last_word_list = last_word_list[1:]
        last_token = " ".join(last_word_list)
        self.markov_model[last_token]["#"] += 1
Developer: iangonzalez | Project: NaNoGenMo | Lines: 28 | Source: NaNoGenMo.py
Example 8: __init__
def __init__(self, oldid, newid, data, general):
    self.newid = newid
    self.oldid = oldid
    self.data = data
    self.tfidfatt = []
    self.tfidfval = []
    self.freatt = []
    self.freval = []
    self.text = ''
    self.ntlk = []
    self.idfvalue = []
    self.general = general
    tokenizer = RegexpTokenizer(r'\w+')
    #stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    for r in tokenizer.tokenize(data):
        a = 0
        if r not in stop:
            if not any(i.isdigit() for i in r):
                r = stemmer.stem(r)
                if r not in self.ntlk:
                    self.ntlk.append(r)
                self.text = self.text + ' ' + r
Developer: t1mch0w | Project: CSE5243 | Lines: 26 | Source: reuter.py
Example 9: mean_stdDeviation
def mean_stdDeviation(self, query, stopWordInstruction):
    list_count_postTitles = []
    list_postTitles = self.data[:][query].tolist()
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_mine = []
    #a.encode('ascii','ignore')
    stopwords_mine += (word.encode('ascii', 'ignore') for word in stopwords.words('english'))
    tokenized_list = []
    new_list_tokenized = []
    for item in list_postTitles:
        tokenized_list.append(tokenizer.tokenize(item))
    if stopWordInstruction == True:
        for item in tokenized_list:
            temp = []
            temp += (word for word in item if word.lower() not in stopwords_mine)
            #print temp
            #raw_input()
            new_list_tokenized.append(temp)
    else:
        new_list_tokenized = copy.deepcopy(tokenized_list)
    for x in new_list_tokenized:
        list_count_postTitles.append(len(x))
    #print list_count_postTitles
    npArray = np.asarray(list_count_postTitles)
    print npArray.mean()
    print npArray.std()
    return [npArray.mean(), npArray.std(), list_postTitles, list_count_postTitles]
Developer: akshay0193 | Project: SAH | Lines: 32 | Source: assignment1.py
Example 10: issue_analysis
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)
    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenizer regex
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '
    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")
    return 0
Developer: choi-junhwan | Project: ConsumerComplaintsDataProject | Lines: 60 | Source: Complaints_TextAnalysis.py
Example 11: stripped_words
def stripped_words(self, original_sentence):
    _sentence = filter(self.printable_char_filter, original_sentence)
    _sentence = _sentence.replace(u'\u2013', ' ')
    _sentence = _sentence.replace(u'\u2014', ' ')
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(_sentence)
    return [word.lower() for word in tokens if word.lower() not in stop_words]
Developer: bdenglish | Project: article_summarizer | Lines: 7 | Source: article_summarizer.py
Example 12: relevance_features
def relevance_features(doc):
    print "relfeatures"
    print doc[:10]
    features = {}
    #print doc
    # Test 1: has synonyms of NIT Warangal
    features['contains synonym'] = 'false'
    for word in synonyms:
        if word in doc:
            features['contains synonym'] = 'true'
            break
    # Test 2: has a person name that appears in Almabase's DB
    count = 0
    names = ner.get_names(data)
    count = ner.query_db(names)
    print 'count is {}'.format(count)
    # if count==0:
    #     features['hasAlumnus']='none'
    # elif count<=3:
    #     features['hasAlumnus']='medium'
    # elif count>3:
    #     features['hasAlumnus']='high'
    # print count
    # Test 3: bag-of-words approach
    tokenizer = RegexpTokenizer(r'\w+')
    document_words = tokenizer.tokenize(doc)
    for word in word_features:
        if word.lower() in document_words:
            print "{} is present".format(word)
        features['contains({})'.format(word.lower())] = (word in document_words)
    return features
Developer: Pr1yanka | Project: Smart-News-Scraper-1 | Lines: 34 | Source: classifier.py
Example 13: preprocess_wikidata
def preprocess_wikidata(raw):
    # initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # initialize lemmatizer
    lemma = WordNetLemmatizer()
    # create English stop words list
    en_stop = get_stop_words('en')
    # decode wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]
    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
Developer: DailyActie | Project: AI_APP_CV-TextTopicNet | Lines: 31 | Source: preprocess_text.py
Example 14: get_product_vocab
def get_product_vocab(dict_queries):
    tok = RegexpTokenizer(r'\w+')
    vocab = {}
    for query, v in dict_queries.items():
        words = defaultdict(int)
        for prod in v:
            w_prod = tok.tokenize(prod[1])
            for w in w_prod:
                #wt = stem(wt)
                if not re.match(r'\d+$', w) and \
                   len(w) > 1 and \
                   w not in stop_words:
                    words[w] += 1
        vocab[query] = words.keys()
        #vocab[query] = [k for (k, v) in words.iteritems() if v > 1]
        """
        print "Query: " + query
        sorted_w = sorted(words.items(), key=lambda x:x[1], reverse=True)
        print sorted_w
        """
    return vocab
Developer: gbakie | Project: kaggle-cf-search | Lines: 26 | Source: multi_svm_model.py
Example 15: write_summary
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
            f.flush()
Developer: kedz | Project: cuttsum | Lines: 7 | Source: build_model_summaries.py
Example 16: count_ngrams
def count_ngrams(sessions, length):
    data = sessions
    data = data.replace(',', ' ')
    tokenizer = RegexpTokenizer("[0-9]+")
    # include only numbers (page IDs) as tokens
    token = tokenizer.tokenize(data)
    from nltk.util import ngrams
    #print list(ngrams(token, 2))
    generated_ngrams = list(ngrams(token, length))
    #print generated_ngrams
    try:
        ngrams = ' '.join(generated_ngrams[0])
    except IndexError:
        global non_list
        non_list += 1
        #print 'Failed to generate ngrams as there is no minimum'
    # print ngrams
    for ngram in generated_ngrams:
        if not ngrams_statistics.has_key(ngram):
            ngrams_statistics.update({ngram: 1})
        else:
            ngram_occurrences = ngrams_statistics[ngram]
            ngrams_statistics.update({ngram: ngram_occurrences + 1})
Developer: Madhuka | Project: episode-mining | Lines: 27 | Source: list_page_sequences.py
Example 17: preprocess
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    #filtered_words = filter(lambda token: token not in stopwords.words('english'))
    return " ".join(filtered_words)
Developer: Yelrose | Project: liblinear_20NewsGroup | Lines: 7 | Source: pipline.py
Example 18: run
def run(self):
    """
    How do I run this Task?
    Luigi will call this method if the Task needs to be run.
    """
    # remove stop words and punctuation
    stop = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    wordnet = WordNetLemmatizer()
    docs = []
    #ipdb.set_trace()
    for f in self.input():  # input() is a wrapper around requires() that returns Target objects
        lines = 0
        words = []
        for line in f.open('r'):
            if lines == 0:
                label = line
                lines += 1
            else:
                words.extend(tokenizer.tokenize(line))
                lines += 1
        words_filtered = filtered_words = [wordnet.lemmatize(w) for w in words if not w in stopwords.words('english')]
        docs.append((label, '\t'.join(words)))
    out = self.output().open('w')
    for label, tokens in docs:
        out.write("%s,%s\n" % (label.strip(), tokens.strip()))
    out.close()
Developer: DATAQC | Project: data-engineering-101 | Lines: 33 | Source: ml-pipeline.py
Example 19: run
def run(self, data):
    results = []
    tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
    for corpus in data:
        corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
        results.append(corpus)
    return results
Developer: kmp3325 | Project: linguine-python | Lines: 7 | Source: remove_punct.py
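Example 19 passes gaps=True, which inverts the tokenizer's behavior: the regular expression describes the separators between tokens rather than the tokens themselves. A simpler sketch of that flag, with a toy pattern and input that are not from the project above:

from nltk.tokenize import RegexpTokenizer

# With gaps=True the pattern matches what lies BETWEEN tokens, so splitting
# on whitespace keeps punctuation attached to the surrounding words.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize("Hello, world!  How are you?"))
# ['Hello,', 'world!', 'How', 'are', 'you?']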
Example 20: lemmatizeall
def lemmatizeall(word_list):
    """Lemmatizes the word_list, passing through each part of speech.
    Input:
        word_list - list of words to be cleaned
    pos options: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    """
    word_types = "v", "a", "n", "s", "r"
    #print(word_types)
    #ipdb.set_trace()
    wnl = nltk.WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    for x in range(0, len(word_list)):
        word_tokens = tokenizer.tokenize(str(word_list[x]))
        word_tokens_lem = word_tokens
        for i in range(0, len(word_types)):
            pos = word_types[i]
            word_tokens_lem = [wnl.lemmatize(w, pos=pos) for w in word_tokens_lem]
        sep = " "
        word_list[x] = sep.join(word_tokens_lem)
        #print(i)
    return word_list  #[wnl.lemmatize(w, pos=pos) for w in word_list]
Developer: AnnaMag | Project: ncvo-s2ds-2015 | Lines: 28 | Source: text_processing.py
Note: The nltk.tokenize.RegexpTokenizer class examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use should follow each project's License. Do not repost without permission.