This article collects typical usage examples of the Python class nltk.stem.WordNetLemmatizer. If you have been wondering what WordNetLemmatizer does, or how to use it in practice, the selected class code examples below may help.
A total of 20 code examples of the WordNetLemmatizer class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
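Before the examples, here is a minimal sketch of the class's basic behavior (the sample words are illustrative only; the WordNet corpus must already be available, e.g. via nltk.download('wordnet')):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# The default part of speech is noun ('n'), so plural nouns are reduced to the singular.
print(lemmatizer.lemmatize("cats"))              # cat
# Without a POS hint a verb form is left unchanged...
print(lemmatizer.lemmatize("running"))           # running
# ...but passing pos='v' reduces it to its base form.
print(lemmatizer.lemmatize("running", pos="v"))  # run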
Example 1: bow_score
def bow_score(hypothesis_list, text_list):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':']
    i = 0
    while i < len(hypothesis_list):
        if hypothesis_list[i] in stop_word_list:
            del hypothesis_list[i]
            i = i - 1
        i = i + 1
    if len(hypothesis_list) == 0:
        return 0
    i = 0
    while i < len(text_list):
        if text_list[i] in stop_word_list:
            del text_list[i]
            i = i - 1
        i = i + 1
    if len(text_list) == 0:
        return 0
    ## Stop words removed up until here
    score = 0
    for word_text in text_list:
        lemma_text = wordnet_lemmatizer.lemmatize(word_text)
        for word_hypothesis in hypothesis_list:
            lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
            print lemma_hypothesis
            print lemma_text
            score += lexical_compare(lemma_text, lemma_hypothesis)
    print str(score)
    return score
Developer: racoder | Project: question-answering-nlp | Lines: 31 | Source: BOW_1.py
Example 2: labelBasedEntry
def labelBasedEntry(term, uri):
    wnl = WordNetLemmatizer()
    hm = {}
    sparql = Sparql.Connection()
    if " " in term:
        term = term.split(" ")[1]
    stem = wnl.lemmatize(term)
    wiktionary_informations = sparql.getWiktionaryInformationsNEW(stem)
    for x in wiktionary_informations:
        if " + " in x[0] and "," not in x[0] and "*" not in x[0]:
            tmp = x[0].split(" + ")[0]
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(tmp, uri, {})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(tmp, uri, {})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(tmp, uri, {})] = ""
        elif "," not in x[0] and "*" not in x[0]:
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(term, uri, {})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(term, uri, {})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(term, uri, {})] = ""
    if len(wiktionary_informations) == 0:
        hm[LexiconGenerator.TransitiveFrame(stem, uri, {})] = ""
        hm[LexiconGenerator.NounPPFrame(stem, uri, {})] = ""
    entry = []
    for key in hm:
        entry.append(key)
    return entry
Developer: swalter2 | Project: knowledgeLexicalisation | Lines: 35 | Source: LabelApproach.py
Example 3: stemming
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
Developer: LewkowskiArkadiusz | Project: magisterka | Lines: 25 | Source: preprocessing.py
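A possible call of the stemming helper above, assuming Python 3 (where .encode returns bytes) and made-up input words, showing how the WordNetLemmatizer branch differs from the Porter stemmer:

words = ["cats", "running", "better"]
# Lemmatization keeps valid word forms (nouns by default): roughly [b'cat', b'running', b'better']
print(stemming(words, type="WordNetLemmatizer"))
# Porter stemming is more aggressive: roughly [b'cat', b'run', b'better']
print(stemming(words, type="PorterStemmer"))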
Example 4: getWordCounts
def getWordCounts(WordCloudTweetNo):
    print('Fetching the most commonly used {0} words in the "{1}" feed...'.format(WordCloudTweetNo, ScreenName))
    cur = "DELETE FROM WordsCount;"
    conn.execute(cur)
    conn.commit()
    cur = 'SELECT tweet_text FROM UserTimeline'
    data = conn.execute(cur)
    StopList = stopwords.words('english')
    Lem = WordNetLemmatizer()
    AllWords = ''
    for w in tqdm(data.fetchall(), leave=1):
        try:
            # remove certain characters and strings
            CleanWordList = re.sub(r'http://[\w.]+/+[\w.]+', "", w[0], flags=re.IGNORECASE)
            CleanWordList = re.sub(r'https://[\w.]+/+[\w.]+', "", CleanWordList, flags=re.IGNORECASE)
            CleanWordList = re.sub(r'[@#\[\]\'"$.;{}~`<>:%&^*()-?_!,+=]', "", CleanWordList)
            # tokenize and convert to lower case
            CleanWordList = [words.lower() for words in word_tokenize(CleanWordList) if words not in StopList]
            # lemmatize words
            CleanWordList = [Lem.lemmatize(word) for word in CleanWordList]
            # join words
            CleanWordList = ' '.join(CleanWordList)
            AllWords += CleanWordList
        except Exception as e:
            print(e)
            sys.exit(e)
    if AllWords is not None:
        words = [word for word in AllWords.split()]
        c = Counter(words)
        for word, count in c.most_common(WordCloudTweetNo):
            conn.execute("INSERT INTO WordsCount (word, frequency) VALUES (?,?)", (word, count))
        conn.commit()
Developer: keyur9 | Project: Examining-your-presence-on-Twitter | Lines: 32 | Source: Twitter_Scrape.py
Example 5: preprocess
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]
    return line
Developer: lngvietthang | Project: imageqa | Lines: 7 | Source: quest2num.py
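A quick hedged check of Example 5: with is_lmz=True the tokens are lemmatized with the default noun POS, so only plural nouns change (the sentence is made up):

print(preprocess("The cats are running", is_lmz=True))
# expected roughly: ['The', 'cat', 'are', 'running']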
Example 6: create_lexicon
def create_lexicon(pos_file, neg_file):
    lex = []

    # Read a file and collect its tokens
    def process_file(_f):
        with open(_f, 'r') as f:
            lex = []
            lines = f.readlines()
            # print(lines)
            for line in lines:
                words = word_tokenize(line.lower())
                lex += words
            return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)
    # print(len(lex))

    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex]  # lemmatize (cats -> cat)

    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # Drop very common words (the, a, and, ...) and very rare words; they contribute nothing
    # to deciding whether a review is positive or negative.
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:  # hard-coded thresholds; a percentage might work better
            lex.append(word)  # Zipf's law - verifying the Zipf distribution of text with Python: http://blog.topspeedsnail.com/archives/9546
    return lex
Developer: gswyhq | Project: hello-world | Lines: 29 | Source: TensorFlow练习1,对评论进行分类.py
Example 7: split_into_words
def split_into_words(text, lemmatize=False, reattach=True, replace_numbers=True, split_off_quotes=True,
                     fix_semicolon_mistakes=True):
    if fix_semicolon_mistakes:
        text = fix_semicolons(text)

    word_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    # get rid of certain characters so that we can use those for special purposes
    tokens = word_tokenizer.tokenize(text)

    if reattach:
        tokens = reattach_clitics(tokens)

    if split_off_quotes:
        tokens = split_off_quote_marks(tokens)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if replace_numbers:
        tokens = [re.sub('[0-9]', '#', t) for t in tokens]

    tokens = split_off_final_punctuation(tokens)
    tokens = split_off_commas(tokens)

    return tokens
Developer: dallascard | Project: guac | Lines: 28 | Source: tokenizer.py
Example 8: lemmatize
def lemmatize(self):
    wnl = WordNetLemmatizer()
    self.lemma_list = []
    for i in self.tokens_no_punct:
        lemmy_word = wnl.lemmatize(i)
        self.lemma_list.append(unicode(lemmy_word))
Developer: mjlavin80 | Project: walker | Lines: 7 | Source: text_process.py
Example 9: get_words_list
def get_words_list(dataset):
    '''
    Load the dataset and read its contents, tokenize the text and lemmatize the words.
    '''
    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'
    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]

    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

    splitter = re.compile("\\W*")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # tokenize the files into words
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                                        if w not in english_stops and w.isalpha())
                      if len(word) > 2 and len(word) < 20]

    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                                       if w not in english_stops and w.isalpha())
                     if len(word) > 2 and len(word) < 20]

    return spam_wl, ham_wl
Developer: alifars | Project: training-classifier-in-python | Lines: 30 | Source: svm_clf.py
Example 10: preProcessHistogram
def preProcessHistogram(documents):
    """
    preProcessHistogram(listofString) -> listOfString

    Consumes a list of sentences, tokenizes it, and returns a list of lemmatized words.
    """
    paragraph = ""
    for sentence in documents:
        paragraph = paragraph + " " + sentence.lower()

    # make all words lowercase and remove all punctuation
    lowerCaseParagraph = paragraph.translate(maketrans("", ""), punctuation)

    words = lowerCaseParagraph.split()

    lemmatizer = WordNetLemmatizer()
    # lemmatize every word (if it needs to be lemmatized) and remove words that are too long,
    # because chances are they aren't words
    words = map(lambda x: lemmatizer.lemmatize(x, 'v'), words)
    words = filter(lambda x: len(x) < 10 or x.isdigit(), words)

    return words
Developer: t3abdulg | Project: Twitter-Topic-Modelling | Lines: 25 | Source: textanalytics.py
Example 11: text_tokenize
def text_tokenize(sentence):
    # stemmer = SnowballStemmer('english')
    lmtr = WordNetLemmatizer()
    tokens = [x.lower() for x in word_tokenize(sentence) if x.isalpha()]
    tokens_tagged = nltk.pos_tag(tokens)
    tokens_tagged = [(x, get_wordnet_pos(y)) for (x, y) in tokens_tagged if x not in stopwords.words('english')]
    return [lmtr.lemmatize(x, y) if y != '' else x for (x, y) in tokens_tagged]
Developer: chandlerzuo | Project: chandlerzuo.github.io | Lines: 7 | Source: inaugural.py
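Examples 11 and 13 rely on a get_wordnet_pos helper that is not shown in their snippets; a sketch consistent with the version embedded in Example 14 below maps Penn Treebank tags to the WordNet POS constants that lemmatize() expects:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag prefix to the corresponding WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''  # callers fall back to the raw token when no mapping exists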
Example 12: getdata
def getdata():
    """
    Retrieves the data from the repository table,
    removes special characters and stop words, and does the stemming (using the nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        # print desc
        if desc is not None:
            desc = desc.replace('-', ' ').replace(',', ' ').replace('/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(counter) + ",'" + word + "'," + str(id) + ',' + str(0) + ")"
                    print sql1
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
Developer: rajuch | Project: Kmeans-Clustering | Lines: 30 | Source: processdata.py
Example 13: lemmatize
def lemmatize(tweets):
    '''
    Lemmatize words in the corpus.

    Input:
    ------------------
    tweets: List of lists, [[word1OfTweet1, word2OfTweet1,...,word_m1OfTweet1],
                            [word1OfTweet2, word2OfTweet2,...,word_m2OfTweet2],
                            .
                            .
                            .
                            [word1OfTweetN, word2OfTweetN,...,word_mNOfTweetN]]

    Output:
    -----------------
    newTweets: All the words in the tweets lemmatized.
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    pos_tag_tweets = [nltk.pos_tag(t) for t in tweets]
    tweets = []
    i = 0
    for t in pos_tag_tweets:
        tt = []
        for w in t:
            if get_wordnet_pos(w[1]) == '':
                tt.append(w[0])
            else:
                try:
                    tt.append(wordnet_lemmatizer.lemmatize(w[0], pos=get_wordnet_pos(w[1])))
                except UnicodeDecodeError:
                    pass
        tweets.append(tt)
        i += 1
    return tweets
Developer: EliasJonsson | Project: PGM-Project | Lines: 33 | Source: preprocess.py
Example 14: _lemma_
def _lemma_(token):
    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)

    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
Developer: gkotsis | Project: negation-detection | Lines: 28 | Source: negation_detection.py
Example 15: GetCleanWords
def GetCleanWords(content_string):
    # Tokenize the sentences using the Punkt word tokenizer
    tokenized_words = PunktWordTokenizer().tokenize(content_string)

    # Now let's remove the stop words
    tokenized_words = [word for word in tokenized_words if word.lower() not in stopwords_list]

    # Now let's remove the tokens that are solely punctuation
    punctuation_list = ['.', ',', ';', ':', '!', '?']
    tokenized_words = [word for word in tokenized_words if word not in punctuation_list]

    # Finally let's get rid of the punctuation at the end of each word
    cleaned_words = []
    for word in tokenized_words:
        if word[-1] in punctuation_list:
            cleaned_words.append(word[:-1])
        else:
            cleaned_words.append(word)

    # Now let's lemmatize each of the words to lower our word count
    wnl = WordNetLemmatizer()
    clean_and_stemmed_words = [wnl.lemmatize(cleaned_word) for cleaned_word in cleaned_words]
    return clean_and_stemmed_words
Developer: jellis505 | Project: NLPtools | Lines: 25 | Source: CreateDict.py
Example 16: possibility
def possibility():
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize(verbs[random.randrange(0, len(verbs))])
    noun = wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
    article = "a"
    if noun[0] in ["a", "e", "i", "o", "u"]:
        article = "an"

    if random.randrange(0, 100) < chance_quantity:
        quantity_word = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        if not noun.endswith("s") and not noun.endswith("y") and not quantity_word == "numerous":
            noun += "s"
        possibility = verb + " " + quantity_word + " of the " + noun
    elif random.randrange(0, 100) < chance_location:
        location_word = location_adverbs[random.randrange(0, len(location_adverbs))]
        possibility = (
            verb
            + " "
            + article
            + " "
            + noun
            + " "
            + location_word
            + " the "
            + wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
        )
    else:
        possibility = verb + " " + article + " " + noun
    return possibility
Developer: jeffThompson | Project: WouldYouRatherBot | Lines: 33 | Source: WouldYouRatherBot.py
Example 17: lemmatize_tweets
def lemmatize_tweets(input_path, output_path):
    wordnet_lemmatizer = WordNetLemmatizer()

    input_files = glob.glob("%s/dataset_*.out" % input_path)
    for input_file in input_files:
        results = re.search('(dataset_.+)\.out', input_file)
        filename = results.groups()[0]
        output_file = "%s/%s_converted.out" % (output_path, filename)
        output_file2 = "%s/%s_converted.out.id" % (output_path, filename)
        with codecs.open(output_file, encoding='utf-8', mode='w') as out:
            with codecs.open(output_file2, encoding='utf-8', mode='w') as out2:
                print >>out, "<doc>"
                with codecs.open(input_file, encoding='utf-8', mode='r') as f:
                    for line in f:
                        if re.search('TWEETID(\d+)START', line):
                            results = re.match('TWEETID(\d+)START', line)
                            groups = results.groups()
                            print >>out, "<p>"
                            print >>out2, "ID=%d" % (int(groups[0]))
                        elif re.search('TWEETID(\d+)END', line):
                            print >>out, "<\p>"
                        elif re.search("(.+)\t(.+)\t(.+)\n", line):
                            results = re.match("(.+)\t(.+)\t(.+)\n", line)
                            groups = results.groups()
                            word = groups[0]
                            pos = groups[1]
                            lemma = wordnet_lemmatizer.lemmatize(word)
                            print >>out, "%s\t%s\t%s" % (word, pos, lemma)
                            print >>out2, "%s\t%s\t%s" % (word, pos, lemma)
                print >>out, "</doc>"
Developer: hds-lab | Project: coding-ml | Lines: 35 | Source: tasks.py
Example 18: preprocess_wikidata
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove very short tokens (one or two characters)
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
Developer: DailyActie | Project: AI_APP_CV-TextTopicNet | Lines: 31 | Source: preprocess_text.py
Example 19: preprocess_text
def preprocess_text(raw_text):
    """
    Text preprocessing.

    Parameters:
        - raw_text: the original text
    Returns:
        - proc_text: the processed text
    """
    # Convert everything to lowercase
    raw_text = raw_text.lower()

    # 1. Remove punctuation with a regular expression
    filter_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    words_only = filter_pattern.sub('', raw_text)

    # 2. Tokenize
    raw_words = nltk.word_tokenize(words_only)

    # 3. Lemmatize
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]

    # 4. Remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('english')]

    proc_text = ' '.join(filtered_words)
    return proc_text
Developer: ustbxyls | Project: GitRepo | Lines: 28 | Source: utils.py
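A small usage sketch of Example 19 (the sentence is made up; the exact output depends on NLTK's stopword list, but with the default noun POS it would look roughly like this):

print(preprocess_text('The striped bats are hanging on their feet.'))
# expected roughly: 'striped bat hanging foot'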
Example 20: get_clean_text
def get_clean_text(list_filenames, path_to_file):
    '''
    parameters:
    ----------
    list_filenames: as LST, a list of filenames as STR
    path_to_file: as STR, the path to the folder containing the movie scripts
                  --> such that path_to_file/filename.txt is the file to open

    returns:
    --------
    list of lists of words (lemmatized, lowercased) in each text (order preserved)
    '''
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file + "/" + filename + ".txt"
        with open(path_file) as f:
            text = f.readlines()
        lines = [line.strip() for line in text if line.strip()]
        string_words = []
        for line in lines:
            words = [wnl.lemmatize(word.lower()) for word in line.split(' ') if wnl.lemmatize(word.lower())]
            string_words += words
        list_texts_as_words.append(string_words)
    return list_texts_as_words
Developer: AnnaVM | Project: Project_Plotline | Lines: 25 | Source: emotions_script.py
Note: The nltk.stem.WordNetLemmatizer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright in the source code remains with the original authors. Consult the corresponding project's license before redistributing or using the code; do not reproduce without permission.