This article collects typical usage examples of the Python class nltk.stem.lancaster.LancasterStemmer. If you are wondering what LancasterStemmer is used for, or how to use it in practice, the curated class examples below may help.
The following shows 20 code examples of the LancasterStemmer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
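Before the project examples, here is a minimal, self-contained sketch of the class's basic behaviour (the sample words are arbitrary):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
print([stemmer.stem(w) for w in ["maximum", "presumably", "crying"]])
# Lancaster is an aggressive suffix-stripping stemmer; e.g. "maximum" typically becomes "maxim"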
Example 1: stem_tweet
def stem_tweet(tweet, stemmer_type="lancaster"):
    """
    :param tweet: string representing tweet
    :param stemmer_type: type of stemmer used (default value is lancaster)
    :return: stemmed tweet
    :type tweet: str
    :type stemmer_type: str
    """
    tokens = nltk.word_tokenize(tweet)
    stemmed_tokens = []
    if stemmer_type == "lancaster":
        stemmer = LancasterStemmer()
    elif stemmer_type == "snowball":
        stemmer = SnowballStemmer("english")
    elif stemmer_type == "porter":
        stemmer = PorterStemmer()
    elif stemmer_type == "regexp":
        stemmer = RegexpStemmer("english")
    else:
        return None
    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token))
    ret_tw = "".join([" " + i if not i.startswith("'") and i not in string.punctuation else i for i in stemmed_tokens]).strip()
    return ret_tw
Author: GavriloDrljaca | Project: ANNProject | Lines of code: 26 | Source: nltk_manipulation.py
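A hedged usage sketch for Example 1; the sample tweet is invented, and it assumes the surrounding module imports nltk, string and the stemmer classes (LancasterStemmer, SnowballStemmer, PorterStemmer, RegexpStemmer), as the function presumes:

print(stem_tweet("The runners were running quickly", stemmer_type="porter"))
print(stem_tweet("The runners were running quickly", stemmer_type="lancaster"))
# the Lancaster variant usually yields shorter, more aggressive stems than the Porter variant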
Example 2: stem_text
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)
    return snt
Author: uml-cs-nlp-sentence-completion | Project: Sherlock | Lines of code: 7 | Source: process_file.py
Example 3: lemmatizer_newsheadlines
def lemmatizer_newsheadlines():
    lancaster_stemmer = LancasterStemmer()
    frl = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv", "rU")
    fr = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv", "rU")
    fw = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv", "w")
    for headline in fr:
        if len(headline) > 0:
            headlinelist = headline.split(",")
            if len(headlinelist) == 3:
                headlinewords = headlinelist[1].split(" ")
                print(headlinewords)
                for word in headlinewords:
                    wordcor = (((word.replace("?", "")).replace(":", "")).replace("\"", ""))
                    headlineword = (lancaster_stemmer.stem(wordcor)).lower()
                    print(headlineword)
                    # for line in frl:
                    #     crimelist = line.split(",")
                    #     crimeword = ((crimelist[1].replace("\"", "")).strip()).lower()
                    #     print(crimeword + str(i))
                    #     i += 1
                    dictcrime = lemmadict()
                    if headlineword in dictcrime:
                        print(headlineword + "yipee")
                        fw.write(headlineword + "," + headlinelist[0] + "," + headlinelist[1] + "\n")
                        break
    frl.close()
    fw.close()
    fr.close()
Author: 22parthgupta18 | Project: Crime_Visualization | Lines of code: 32 | Source: lemmatizer.py
Example 4: simplify_old
def simplify_old(s):
    res = ''
    st = LancasterStemmer()
    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)
    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn=20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda c_f_list: c_f_list[1], reverse=True)
                word_freq = fdist[word]
                # synonmys = f.getSynonmys(word)  ## get synonyms from wordnet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):  ## break if candidate word frequency does not exceed the word frequency by a threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):  ## exclude morphological derivations and same pos
                        word = w[0]  ### do not use wordnet
                        # if w[0] in synonmys:
                        #     word = w[0]
                        # else:
                        #     for syn in synonmys:
                        #         if st.stem(w[0]) == st.stem(syn):
                        #             word = w[0]
        res = res + word + ' '
    return res
Author: wufei523 | Project: SimpleTestUmb | Lines of code: 34 | Source: utils.py
Example 5: filt
def filt(string):
    ret = string
    # Filter all punctuation from string
    for p in punctuation:
        ret = ret.replace(p, '')
    # Replace hyphens with spaces
    ret = ret.replace('-', ' ')
    oldret = ret
    ret = ""
    # Filter all stop words from string
    for word in oldret.split():
        if (word in allStopWords) or len(word) <= 1:
            pass
        else:
            ret += word.lower() + " "
    st = LancasterStemmer()
    steamed = ""
    for word in ret.split():
        try:
            steamed += str(st.stem(word)) + " "
        except UnicodeDecodeError:
            pass
    return steamed
Author: mitzelu | Project: lexical_analysis_tex | Lines of code: 31 | Source: mrtitlefreq.py
Example 6: mapper
def mapper():
    # list of fields in positional order expected in inbound
    # forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body',
                  'node_type', 'parent_id', 'abs_parent_id',
                  'added_at', 'score', 'state_string', 'last_edited_id',
                  'last_activity_by_id', 'last_activity_at',
                  'active_revision_id', 'extra', 'extra_ref_id',
                  'extra_count', 'marked']
    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')
    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:
        pid = line['id']
        body = line['body']
        # split body into words
        words = split_pattern.split(body)
        # map the stemmer function across all the words,
        # and use the Counter to create a dict
        # of counted stems. Remove english stopwords.
        stem_counts = Counter((stemmer.stem(x) for x in words if x not in stopw))
        # emit the stem, count and node id
        # for reduction into the reverse index
        for stem, count in stem_counts.items():
            print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
Author: wgberger | Project: UdacityIntroToMR | Lines of code: 33 | Source: mapper.py
Example 7: preprocess
def preprocess(reviews):
    import nltk
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]
    # print "review tokenize done"
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
    # print 'remove stop words done'
    # remove punctuations
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
    # print 'remove punctuations done'
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    # print 'stemming done'
    return review_stemmed
Author: anirudhreddy92 | Project: DataMining_Capstone | Lines of code: 25 | Source: task3.1.py
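A hedged usage sketch for Example 7; the reviews are invented, and they are passed as byte strings because the function calls review.decode('utf-8') in the original (Python 2 style):

reviews = [b"The battery life is excellent!", b"Excellent screen, poor battery."]
tokens_per_review = preprocess(reviews)
# each review becomes a list of lowercased, stopword- and punctuation-free Lancaster stems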
Example 8: preprocess
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # nouns and verbs only
    for pos in pos_list:
        # print pos[1]
        # if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    for word, freq in wordfreq.items():
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # stemmer
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except:
            stemfreq[stem] = freq
    return stemfreq
Author: TorchmanX | Project: TARS | Lines of code: 33 | Source: nc.py
Example 9: processRawData
def processRawData(self, inputPath, outputPath):
    raw = pickle.load(open(inputPath, "r"))
    data = []
    genres = set([])
    count = 0
    st = LancasterStemmer()
    for key in raw.keys():
        movie = raw[key]
        # skip if there is no genre or synopsis data
        if 'genres' not in movie or 'synopsis' not in movie: continue
        if len(movie['genres']) == 0 or movie['synopsis'] == '': continue
        temp = {}
        temp['genres'] = movie['genres']
        for g in temp['genres']:
            genres.add(g)
        # trim out the punctuation and transform to lowercase
        # replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
        s = str(movie['synopsis'])
        s = s.translate(string.maketrans("", ""), string.punctuation)
        s = re.sub(' +', ' ', s).strip()
        s = " ".join(st.stem(word) for word in s.split(" "))
        temp['synopsis'] = s.lower()
        data.append(temp)
        count += 1
    # output as a pickle file
    file = open(outputPath, 'wb')
    pickle.dump(data, file)
    print 'processed ' + str(count) + ' movies'
    return genres
Author: calvinyu | Project: SEA-final-project | Lines of code: 29 | Source: trainer.py
Example 10: parse_validation
def parse_validation(validation_path):
    validation_list = []
    with open(validation_path) as f:
        for line in f:
            strs = line.split('|')
            word_dict = {}
            validation_list.append(word_dict)
            word_dict["word"] = strs[0].strip()
            word_dict["real_sense"] = int(strs[1])
            sentence_list = []
            word_dict["sentence"] = sentence_list
            lmtzr = WordNetLemmatizer()
            ls = LancasterStemmer()
            single_words = re.findall("(\w+|%%)", strs[2])
            double_mod_found = False
            word_count = 0
            for single_word in single_words:
                if single_word == "%%":
                    if not double_mod_found:
                        word_dict["target_word_idx"] = word_count + 1
                        double_mod_found = True
                    continue
                lemmed = lmtzr.lemmatize(single_word)
                stemmed = ls.stem(lemmed)
                if not stemmed in glob_Lucene:
                    sentence_list.append(stemmed)
                    word_count += 1
    return validation_list
Author: joycez | Project: NLP_Proj2 | Lines of code: 30 | Source: dic_preprocessing.py
Example 11: getMaybeWords
def getMaybeWords(self, text_ls):
    ignoreWords = ["", "have", "her", "there", "the", "be", "to", "of", "and", "a", "in", "that", "it", "for", "on", "with", "as", "at", "this", "but", "his", "by", "from", "they", "or", "an", "will", "would", "so", "even", "is", "be", "am", "are"]
    word_ls = []
    for text in text_ls:
        word_ls += wordpunct_tokenize(text)
    frequencies = {}
    st = LancasterStemmer()
    for word in word_ls:
        if not word[0].isalpha():
            continue
        if word in ignoreWords:
            continue
        word_stem = st.stem(word)
        if word_stem in frequencies:
            frequencies[word_stem] += 1
        else:
            frequencies[word_stem] = 1
    sorted_frequencies = sorted(frequencies.iteritems(), key=operator.itemgetter(1), reverse=True)
    # print sorted_frequencies
    max_words = 30
    if len(sorted_frequencies) < max_words:
        max_words = len(sorted_frequencies)
    word_tuples = sorted_frequencies[0:max_words]
    words = [tuple[0] for tuple in word_tuples]
    print words
    return words
Author: schasins | Project: school-program-scraping | Lines of code: 30 | Source: sfusd_demo.py
Example 12: build_analyzer
def build_analyzer(self):
    """
    Return a callable that handles preprocessing and tokenization
    """
    preprocess = self.build_preprocessor()
    tokenize = self.build_tokenizer()
    stemmer = LancasterStemmer()
    filter_meta = lambda doc: ' '.join([w for w in doc.split() if not w.startswith('~')])
    parse_words = lambda doc: tokenize(preprocess(filter_meta(self.decode(doc))))
    stem_words = lambda doc: [stemmer.stem(t) for t in parse_words(doc)]
    meta_func = lambda prefix: lambda doc: (t for t in self.decode(doc).split() if t.startswith(prefix))
    feat_func_map = {
        'word': lambda doc: self._word_ngrams(parse_words(doc), self.get_stop_words()),
        'stem': lambda doc: self._word_ngrams(stem_words(doc), self.get_stop_words()),
        '1st': lambda doc: ('~T:1st' for i in parse_words(doc) if i in first_person_words),
        '3rd': lambda doc: ('~T:3rd' for i in parse_words(doc) if i in third_person_words),
        'tag': lambda doc: self._word_ngrams([t[1] for t in nltk.pos_tag(parse_words(doc))]),
        'length': lambda doc: ['~L:%d' % (len(parse_words(doc)) / 5)],
        'genre': meta_func('~G'),
        'rating': meta_func('~Ra'),
        'votes': meta_func('~V'),
        'lang': meta_func('~La'),
        'country': meta_func('~Co'),
        'year': meta_func('~Y'),
        'runtime': meta_func('~Rt'),
        'type': meta_func('~T')
    }
    func_list = [feat_func_map.get(flag.strip()) for flag in self.analyzer.split(':')] \
        if type(self.analyzer) is str else None
    if not func_list:
        raise ValueError('%s is not a valid tokenization scheme/analyzer' % self.analyzer)
    else:
        return lambda doc: itertools.chain.from_iterable(f(doc) for f in func_list if callable(f))
Author: sevengram | Project: ml-hw | Lines of code: 35 | Source: classify.py
Example 13: readText
def readText(textFile):
    examples = []
    count = 0
    lexicon_en = {}
    lexicon_ge = {}
    stem_en = LancasterStemmer()
    stem_ge = nltk.stem.snowball.GermanStemmer()
    for line in open(textFile):
        count += 1
        if count % 1000 == 0:
            print count
        lans = line.lower().strip().split("|||")
        # german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
        german = lans[0].strip().split(" ")
        german = process(german)
        for wordx in german:
            for word in wordx:
                if word not in lexicon_ge:
                    lexicon_ge[word] = 1
                else:
                    lexicon_ge[word] += 1
        eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
        # parse_en = pattern.en.parse(" ".join(eng))
        eng = lans[1].strip().split(" ")
        for word in eng:
            if word not in lexicon_en:
                lexicon_en[word] = 1
            else:
                lexicon_en[word] += 1
        examples.append(Example(german, eng))
    return examples, lexicon_en, lexicon_ge
Author: frederick0329 | Project: sp2016.11-731 | Lines of code: 31 | Source: align-compound.py
Example 14: prepare_corpus
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]
    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8') if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist
    print "Removing Stoplist and Stemming"
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]
    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]
    return texts
Author: showandtellinar | Project: harbinger | Lines of code: 29 | Source: main.py
Example 15: tokenize_rest
def tokenize_rest(text):
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    tokens = []
    whfound = False
    for word in words:
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower(): 'wh'})
            whfound = True
            continue
        elem = wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
            if not synd:
                stem = elem
                synd = wn.synsets(stem)
        dbelement = detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word: every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens
Author: kushalbhabra | Project: nltk-movie-db | Lines of code: 29 | Source: tokens.py
Example 16: get_pretrained_vector
def get_pretrained_vector(session, word2vec_model, vocab_path, vocab_size, vectors):
    print(vectors)
    with gfile.GFile(vocab_path, mode="r") as vocab_file:
        st = LancasterStemmer()
        counter = 0
        counter_w2v = 0.0
        while counter < vocab_size:
            vocab_w = vocab_file.readline().replace("\n", "")
            # vocab_w = st.stem(vocab_w)
            # for each word in the vocabulary, check if a w2v vector exists and inject it;
            # otherwise keep the randomly initialised value.
            if word2vec_model and vocab_w and word2vec_model.__contains__(vocab_w) and counter > 3:
                w2w_word_vector = word2vec_model.get_vector(vocab_w)
                print("word:%s c:%i w2v size %i" % (vocab_w, counter, w2w_word_vector.size))
                vectors[counter] = w2w_word_vector
                counter_w2v += 1
            else:
                vocab_w_st = st.stem(vocab_w)
                if word2vec_model and vocab_w_st and word2vec_model.__contains__(vocab_w_st):
                    w2w_word_vector = word2vec_model.get_vector(vocab_w_st)
                    print("st_word:%s c:%i w2v size %i" % (vocab_w_st, counter, w2w_word_vector.size))
                    vectors[counter] = w2w_word_vector
                    counter_w2v += 1
                else:
                    if not vocab_w:
                        print("no more words.")
                        break
            counter += 1
        print("injected %f per cent" % (100 * counter_w2v / counter))
        print(vectors)
        return vectors
Author: jonathanmanfield | Project: deepreferendum | Lines of code: 33 | Source: embeddings_utils.py
Example 17: process
def process(reviews):
    # tokenize each review
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
    # remove punctuations
    english_punctuations = [',', '.', '...', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    # remove stems that appear only once
    all_stems = sum(review_stemmed, [])
    stems_lt_three = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    final_review = [[stem for stem in text if stem not in stems_lt_three] for text in review_stemmed]
    return final_review
Author: anirudhreddy92 | Project: DataMining_Capstone | Lines of code: 26 | Source: task6.py
Example 18: train_lsi_model
def train_lsi_model(self, texts, num_of_toptics=10):
    texts_tokenized = [[word.lower()
                        for word in word_tokenize(text)]
                       for text in texts]
    # remove the stop words and punctuations
    english_stop_words = stopwords.words('english')
    english_punctuations = [',', '.', ':', '?', '(', ')', '[',
                            ']', '@', '&', '!', '*', '#', '$', '%']
    texts_filtered = [[word for word in text_tokenized
                       if (not word in english_punctuations) and
                       (not word in english_stop_words)]
                      for text_tokenized in texts_tokenized]
    # stem the words
    st = LancasterStemmer()
    texts_stemed = [[st.stem(word) for word in text_filtered]
                    for text_filtered in texts_filtered]
    all_stems = sum(texts_stemed, [])
    stem_once = set(stem for stem in set(all_stems)
                    if all_stems.count(stem) == 1)
    cleaned_texts = [[stem for stem in text if stem not in stem_once]
                     for text in texts_stemed]
    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                          num_topics=num_of_toptics)
    result = lsi[corpus]
    return result
Author: Nanguage | Project: pubmed_xml_analyze | Lines of code: 31 | Source: similarity.py
Example 19: lemmstem
def lemmstem(sentences):
    ''' This function is responsible for performing
        the lemmatization and stemming of the words.
        Input: A list of trees containing the sentences.
               All words are classified by their NE type.
        Output: Lemmatized/stemmed sentences
    '''
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    dic = {'VB': wordnet.VERB,
           'NN': wordnet.NOUN,
           'JJ': wordnet.ADJ,
           'RB': wordnet.ADV}
    for sent in sentences:
        lvsidx = sent.treepositions('leaves')
        for pos in lvsidx:
            word = sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm = lmtzr.lemmatize(word, dic[rtag])
                stem = st.stem(lemm)
                # print word, lemm, stem
                sent[pos] = (word, tag, stem)
            else:
                sent[pos] = (word, tag, word)
    return sentences
Author: picarus | Project: MAI-INLP-ALB5 | Lines of code: 33 | Source: preprocessing_functions.py
Example 20: word_standardize
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]
    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)
    return st_words, sent_result
Author: chqsark | Project: hightext | Lines of code: 25 | Source: pullData.py
Note: The nltk.stem.lancaster.LancasterStemmer class examples in this article were compiled from source-code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution and use are subject to each project's license. Please do not reproduce without permission.