本文整理汇总了Python中spambayes.tokenizer.tokenize函数的典型用法代码示例。如果您正苦于以下问题:Python tokenize函数的具体用法?Python tokenize怎么用?Python tokenize使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tokenize函数的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_dbm_export
def test_dbm_export(self):
    """Export a dbm classifier to CSV and verify the CSV mirrors it.

    Trains one spam and one ham message into a fresh DBDictClassifier,
    exports it with sb_dbexpimp.runExport, then re-opens both the dbm
    store and the CSV file and checks that the global counts and every
    per-word statistic agree.
    """
    # Create a dbm classifier to export.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save & Close so the export sees flushed data.
    bayes.store()
    bayes.close()
    # Export.
    sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
    # Reopen the original.
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    # Verify that the CSV holds all the original data (and, by using
    # the CSV module to open it, that it is valid CSV data).
    fp = open(TEMP_CSV_NAME, "rb")
    try:
        reader = sb_dbexpimp.csv.reader(fp)
        # The first row holds the global ham/spam message counts.
        (nham, nspam) = reader.next()
        self.assertEqual(int(nham), bayes.nham)
        self.assertEqual(int(nspam), bayes.nspam)
        for (word, hamcount, spamcount) in reader:
            word = sb_dbexpimp.uunquote(word)
            self.assert_(word in bayes._wordinfokeys())
            wi = bayes._wordinfoget(word)
            self.assertEqual(int(hamcount), wi.hamcount)
            self.assertEqual(int(spamcount), wi.spamcount)
    finally:
        # The original leaked this handle; always close it.
        fp.close()
开发者ID:bloggse,项目名称:spambayes-lite,代码行数:26,代码来源:test_sb_dbexpimp.py
示例2: _update
def _update(self, folders, is_spam):
    """Scan *folders* for added/removed messages and (un)train on them.

    folders -- iterable of folder objects; each folder's read() returns
               the (added, removed) message mappings since the last scan.
               TODO confirm exact return contract of read() against the
               folder class — not visible from here.
    is_spam -- True when the folders hold spam, False for ham.

    Returns True if any folder had additions or removals.

    NOTE: the order of training calls and transaction commits below is
    deliberate; do not reorder.
    """
    changed = False
    for f in folders:
        log("update from %s" % f.path)
        added, removed = f.read()
        if added:
            log("added %d" % len(added))
        if removed:
            log("removed %d" % len(removed))
        # Commit whatever read() changed before any training happens.
        get_transaction().commit()
        if not (added or removed):
            continue
        changed = True
        # It's important not to commit a transaction until
        # after update_probabilities is called in update().
        # Otherwise some new entries will cause scoring to fail.
        for msg in added.keys():
            self.classifier.learn(tokenize(msg), is_spam)
        del added
        # Subtransaction commit (argument 1): flushes work while keeping
        # the main transaction open, per the note above.
        get_transaction().commit(1)
        log("learned")
        for msg in removed.keys():
            self.classifier.unlearn(tokenize(msg), is_spam)
        if removed:
            log("unlearned")
        del removed
        get_transaction().commit(1)
    return changed
开发者ID:Xodarap,项目名称:Eipi,代码行数:29,代码来源:profile.py
示例3: mapmessages
def mapmessages(f, mboxtype, mapdb):
    """Add every message in mailbox *f* to the reverse token map *mapdb*.

    f        -- path of the mailbox to read (also used as the key under
                which message-ids are filed).
    mboxtype -- "ham" or "spam"; selects which half of each map entry
                receives the message ids.
    mapdb    -- mapping of token -> (ham, spam), where each side maps
                mailbox path -> set of message-ids containing the token.

    Messages without a Message-ID header are skipped.  Progress is
    written to stdout on a single updating line.
    """
    def record(token, msgid):
        # File msgid under token on the ham or spam side, per mboxtype.
        ham, spam = mapdb.get(token, ({}, {}))
        table = ham if mboxtype == "ham" else spam
        msgids = table.get(f, set())
        msgids.add(msgid)
        table[f] = msgids
        # Reassign the entry so shelve-style mappings see the mutation.
        mapdb[token] = (ham, spam)

    i = 0
    for msg in getmbox(f):
        i += 1
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for t in tokenize(msg):
            record(t, msgid)
        if options["Classifier", "x-use_bigrams"]:
            # Also index the bigram-enhanced token stream.
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                record(t, msgid)
    sys.stdout.write("\n")
开发者ID:ehuelsmann,项目名称:spambayes,代码行数:33,代码来源:mkreversemap.py
示例4: train_message
def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    Returns True if the message was newly added (a message previously
    filed in the wrong category is correctly untrained first), False if
    it was already in the correct category.  Exceptions are the
    caller's responsibility.
    """
    from spambayes.tokenizer import tokenize
    key = msg.searchkey
    if cdata.message_db.has_key(key):
        was_spam = cdata.message_db[key] == '1'
    else:
        was_spam = None
    if was_spam == is_spam:
        # Nothing to do -- already trained in the right category.
        return False
    # Either brand new (was_spam is None) or previously misclassified.
    stream = msg.GetEmailPackageObject()
    if was_spam is not None:
        # Reclassified: back out the old training first.
        cdata.bayes.unlearn(tokenize(stream), was_spam)
    # Learn the correct classification.
    cdata.bayes.learn(tokenize(stream), is_spam)
    cdata.message_db[key] = ['0', '1'][is_spam]
    cdata.dirty = True
    return True
开发者ID:Xodarap,项目名称:Eipi,代码行数:27,代码来源:train.py
示例5: runUIAndProxy
def runUIAndProxy():
    """Start the web UI and the POP3 proxy, seed the classifier, serve.

    Registers the proxy user interface on an HTTP server at port 8881,
    proxies localhost:8110 on port 8111, trains the shared classifier
    with one spam and one ham example, signals readiness, then enters
    the Dibbler event loop (does not return until the loop stops).
    """
    ui_server = UserInterfaceServer(8881)
    ui_server.register(ProxyUserInterface(state, _recreateState))
    BayesProxyListener('localhost', 8110, ('', 8111))
    # Seed the classifier so scoring has something to work with.
    state.bayes.learn(tokenizer.tokenize(spam1), True)
    state.bayes.learn(tokenizer.tokenize(good1), False)
    proxyReady.set()
    Dibbler.run()
开发者ID:ArildF,项目名称:rogie,代码行数:9,代码来源:test_sb-server.py
示例6: test_untrain_spam
def test_untrain_spam(self):
    """Untraining a spam message must remove every trace of it."""
    self.h.open('c')
    # Seed the classifier with the message we are about to remove.
    self.h.h.bayes.learn(tokenize(spam1), True)
    # Untrain and check the classifier is back to an empty state.
    self.h.untrain_spam(spam1)
    self.assertEqual(self.h.h.bayes.nham, 0)
    self.assertEqual(self.h.h.bayes.nspam, 0)
    # No token from the message should have word info left behind.
    for tok in tokenize(spam1):
        self.assertEqual(self.h.h.bayes._wordinfoget(tok), None)
开发者ID:Xodarap,项目名称:Eipi,代码行数:11,代码来源:test_sb_filter.py
示例7: test_filter
def test_filter(self):
    """filter() must add the classification header to each message."""
    self.h.open('c')
    # Train one ham and one spam so both classifications are possible.
    self.h.h.bayes.learn(tokenize(good1), False)
    self.h.h.bayes.learn(tokenize(spam1), True)
    self.h.h.store()
    header = options["Headers", "classification_header_name"]
    # The trained-on spam message must be labelled as spam...
    result = email.message_from_string(self.h.filter(spam1))
    self.assert_(result[header].startswith(
        options["Headers", "header_spam_string"]))
    # ...and the trained-on ham message as ham.
    result = email.message_from_string(self.h.filter(good1))
    self.assert_(result[header].startswith(
        options["Headers", "header_ham_string"]))
开发者ID:Xodarap,项目名称:Eipi,代码行数:14,代码来源:test_sb_filter.py
示例8: print_message_score
def print_message_score(msg_name, msg_fp):
msg = email.message_from_file(msg_fp)
bayes = CdbClassifier(open(DB_FILE, 'rb'))
prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
print msg_name, prob
for word, prob in evidence:
print ' ', repr(word), prob
开发者ID:ehuelsmann,项目名称:spambayes,代码行数:7,代码来源:sb_mailsort.py
示例9: test_merge_to_dbm
def test_merge_to_dbm(self):
    """Importing a CSV into an existing dbm store must merge counts.

    Trains a dbm classifier, writes a small CSV training file, runs
    runImport in merge mode (newdbm=False), and verifies every word's
    ham/spam counts equal the sum of the original store and the CSV.
    """
    # Create a dbm classifier to merge with.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save data to check against.
    original_nham = bayes.nham
    original_nspam = bayes.nspam
    original_data = {}
    for key in bayes._wordinfokeys():
        original_data[key] = bayes._wordinfoget(key)
    # Save & Close.
    bayes.store()
    bayes.close()
    # Create a CSV file to import; close it even if a write fails.
    nham, nspam = 3, 4
    csv_data = {"this": (2, 1), "is": (0, 1), "a": (3, 4),
                "test": (1, 1), "of": (1, 0), "the": (1, 2),
                "import": (3, 1)}
    temp = open(TEMP_CSV_NAME, "wb")
    try:
        temp.write("%d,%d\n" % (nham, nspam))
        for word, (ham, spam) in csv_data.items():
            temp.write("%s,%s,%s\n" % (word, ham, spam))
    finally:
        temp.close()
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file),
    # and the data from the original dbm database.
    bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes2.nham, nham + original_nham)
    self.assertEqual(bayes2.nspam, nspam + original_nspam)
    # keys() already returns a fresh list in Python 2; the original's
    # extra [:] copy was redundant.
    words = original_data.keys() + csv_data.keys()
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes2._wordinfokeys())
        # Expected counts: CSV contribution plus any original counts.
        h, s = csv_data.get(word, (0, 0))
        wi = original_data.get(word)
        if wi:
            h += wi.hamcount
            s += wi.spamcount
        wi2 = bayes2._wordinfoget(word)
        self.assertEqual(h, wi2.hamcount)
        self.assertEqual(s, wi2.spamcount)
开发者ID:bloggse,项目名称:spambayes-lite,代码行数:44,代码来源:test_sb_dbexpimp.py
示例10: test_train_spam
def test_train_spam(self):
    """Training one spam message must update all classifier counters."""
    self.h.open('c')
    self.h.train_spam(spam1)
    # Exactly one spam, zero ham, after training.
    self.assertEqual(self.h.h.bayes.nham, 0)
    self.assertEqual(self.h.h.bayes.nspam, 1)
    # Every token must carry a pure-spam word-info record.
    for tok in tokenize(spam1):
        info = self.h.h.bayes._wordinfoget(tok)
        self.assertEqual(info.hamcount, 0)
        self.assertEqual(info.spamcount, 1)
开发者ID:Xodarap,项目名称:Eipi,代码行数:10,代码来源:test_sb_filter.py
注:本文中的spambayes.tokenizer.tokenize函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论