本文整理汇总了Python中pyspark.mllib.feature.HashingTF类的典型用法代码示例。如果您正苦于以下问题:Python HashingTF类的具体用法?Python HashingTF怎么用?Python HashingTF使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了HashingTF类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: main
def main(sc):
    """Cluster tokenized tweets with K-means over TF-IDF features.

    Reads whitespace-separated documents (one per line) from
    ``hdfs:/adi/tokens1.txt``, hashes them into term-frequency vectors,
    re-weights them with IDF, trains a 5-cluster KMeans model, saves it as
    ``tweetModel1`` and prints the cluster centers.

    Args:
        sc: SparkContext to run on. It is stopped before returning.
    """
    # NOTE: an earlier version also collected every line of
    # hdfs:/adi/sample.txt to the driver to strip stop words, but the
    # filtered list was never used. That dead pass is intentionally gone:
    # it pulled the whole data set into driver memory for no result.

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(
        lambda line: line.split(" "))

    num_dims = 100000
    tf = HashingTF(num_dims).transform(documents)
    # tf is traversed twice (IDF fit, then IDF transform), so cache it.
    tf.cache()
    tfidf = IDF().fit(tf).transform(tf)
    tfidf.count()

    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    sc.stop()
开发者ID:aditcoding,项目名称:zfs,代码行数:31,代码来源:ml.py
示例2: TFIDF
def TFIDF(source, destination):
    """Write per-document TF vectors and the corpus TF-IDF vectors to disk.

    Relies on a module-level SparkContext named ``sc``.

    Args:
        source: Path handed to ``sc.wholeTextFiles``; each file read is
            treated as one document.
        destination: Directory that receives ``TF_<n>.txt`` (one file per
            document) and ``TF-IDF.txt``. A trailing ``/`` is optional.
    """
    import os

    # wholeTextFiles yields (name, text) pairs; only the text is tokenized.
    # Indexing replaces a tuple-parameter lambda, which Python 3 rejects.
    rdd = sc.wholeTextFiles(source).map(lambda pair: pair[1].split())
    tf_vectors = HashingTF().transform(rdd).cache()

    # Store the TF values in individual files, one per document.
    for ind, vector in enumerate(tf_vectors.collect()):
        with open(os.path.join(destination, "TF_%d.txt" % ind), 'w') as out:
            out.write(str(vector))

    # Fit the IDF weights on the TF vectors, then apply them to the same set.
    tfidf_vectors = IDF().fit(tf_vectors).transform(tf_vectors)

    # Write all TF-IDF values to a single file. `with` guarantees the handle
    # is flushed and closed even if collect() or write() fails.
    with open(os.path.join(destination, "TF-IDF.txt"), 'w') as out:
        out.write(str(tfidf_vectors.collect()))

    # NOTE(review): this prints 100 blank lines and looks like leftover test
    # output; kept so stdout is unchanged -- confirm before removing.
    try:
        for _ in range(100):
            print("")
    except KeyboardInterrupt:
        pass
开发者ID:rikinmathur,项目名称:EECS-6895-FINAL-PROJECT,代码行数:28,代码来源:maanittf.py
示例3: get_feature_vectors
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Build TF-IDF feature vectors for every line of a text file.

    Args:
        sc: SparkContext used to read the file.
        input_file: Path of a text file holding one document per line.
        feature_dimensions: Feature dimension passed to HashingTF.

    Returns:
        A pair ``(tokens_rdd, tfidf_rdd)``: the cached tokenized lines and
        the cached TF-IDF vectors computed from them.
    """
    # One document per line, tokenized by the module's _tokenize helper.
    tokens = sc.textFile(input_file).map(lambda line: _tokenize(line))
    tokens.cache()

    # The default feature dimension is 2^20; for a corpus of about a million
    # tweets, 50000 or 100000 dimensions are recommended. Use more
    # dimensions for a larger corpus.
    term_freqs = HashingTF(feature_dimensions).transform(tokens)
    term_freqs.cache()

    idf_model = IDF(minDocFreq=2).fit(term_freqs)
    weighted = idf_model.transform(term_freqs)
    weighted.cache()
    return tokens, weighted
开发者ID:rohithvsm,项目名称:spark_exercises,代码行数:25,代码来源:tweets_kmeans_classifier.py
示例4: main
def main():
    """Compute summed per-key TF-IDF scores for a corpus and send them to MongoDB.

    All configuration and helpers (``spark_context``, ``spark_master``,
    ``hdfs_path``, ``tokenize``, ``doc_tfidf``, ``mongo_host``,
    ``mongo_user``, ``mongo_pass``, ``clear_mongodb``, ``send_mongodb``)
    are read from module scope.
    """
    # Initialise the SparkContext.
    sc = spark_context(spark_master)
    # Read the input file.
    data = sc.textFile(hdfs_path)
    # Tokenize each line.
    documents = data.map(tokenize)
    documents.cache()
    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    # tf feeds both IDF.fit and IDF.transform below; caching it avoids
    # hashing the whole corpus twice.
    tf.cache()
    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    # TF-IDF
    tfidf = idf.transform(tf)
    # Connect to MongoDB.
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)
    # Pair every document with its TF-IDF vector, then sum the scores per key.
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {
            'text': article[0].encode('utf-8'),
            'size': int(article[1] * 10),
        }
        send_mongodb(mongo_client, item)
开发者ID:yankaics,项目名称:zhangxinyun-spark,代码行数:35,代码来源:tfidf.py
示例5: generatedHashedFeatures
def generatedHashedFeatures(tweet):
    """Wrap the hashed term frequencies of a text in a LabeledPoint.

    NOTE(review): ``tweet`` is never read and ``text`` is not defined inside
    this function; the label/text extraction steps below are still
    placeholders. Presumably ``text`` is expected at module scope -- confirm.
    """
    # TODO: get label from tweet
    # TODO: get text from tweet
    hashed = HashingTF(50000).transform(text)
    return LabeledPoint("0", hashed)
开发者ID:LeotisBuchanan,项目名称:stream-data-analysis-realtime,代码行数:7,代码来源:trainNaiveandCreateNaiveBayesModel.py
示例6: tfidf
def tfidf(self):
    """Return the TF-IDF vectors computed from ``self.token_rdd``.

    ``self._create_rdd()`` is called first; presumably it populates
    ``self.token_rdd`` -- confirm against the class.
    """
    self._create_rdd()
    term_freqs = HashingTF().transform(self.token_rdd)
    idf_model = IDF(minDocFreq=2).fit(term_freqs)
    return idf_model.transform(term_freqs)
开发者ID:nhu2000,项目名称:wiki-search,代码行数:7,代码来源:make_tfidf.py
示例7: get_tfidf_features
def get_tfidf_features(txt):
    """Turn tokenized documents into TF-IDF vectors.

    Args:
        txt: Input handed to ``HashingTF.transform``. Because the result is
            cached, an RDD of token lists is assumed -- confirm with callers.

    Returns:
        The TF-IDF vectors produced by an IDF model fitted on ``txt`` itself.
    """
    term_freqs = HashingTF().transform(txt)
    # Reused by both the IDF fit and the IDF transform.
    term_freqs.cache()
    idf_model = IDF().fit(term_freqs)
    return idf_model.transform(term_freqs)
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:8,代码来源:amazon_review_tfidf_normalized.py
示例8: transform
def transform(idf, article):
    """Transform an article into a TF-IDF sparse vector.

    Args:
        idf: Already fitted IDF model; its ``transform`` is applied to the
            article's term frequencies.
        article: Raw article, tokenized with the module's ``tokenizing``.
    """
    tokens = tokenizing(article)
    term_freqs = HashingTF().transform(tokens)
    return idf.transform(term_freqs)
开发者ID:xiaoyubai,项目名称:wiki-search,代码行数:8,代码来源:model.py
示例9: tfidf
def tfidf(rdd_doc):
    """Compute cached TF-IDF vectors for an RDD of tokenized documents.

    Returns:
        A pair ``(tfidf_rdd, index_of)`` where ``index_of(term)`` gives the
        hash-bucket index the same HashingTF instance assigns to ``term``.
    """
    hashing_tf = HashingTF()

    train_tf = hashing_tf.transform(rdd_doc)
    train_tf.cache()

    idf_model = IDF().fit(train_tf)
    train_tfidf = idf_model.transform(train_tf)
    train_tfidf.cache()

    def index_of(term):
        return hashing_tf.indexOf(term)

    return train_tfidf, index_of
开发者ID:hendrydong,项目名称:StackOverFlow_Analysis_PySpark,代码行数:8,代码来源:tfidf_v2.py
示例10: makeDict
def makeDict(x):
    """Record which word hashes to which HashingTF bucket.

    Adds ``index -> word`` entries to the module-level dict ``data`` for
    every word in ``x``; when several words share a bucket, the first one
    recorded is kept. Prints the size of ``data`` afterwards.

    Args:
        x: Sequence of words.
    """
    global data
    # Named hashing_tf rather than `hash` to avoid shadowing the builtin.
    hashing_tf = HashingTF(100000)
    for word in x:
        # setdefault only inserts when the bucket index is not present yet.
        data.setdefault(hashing_tf.indexOf(word), word)
    # Single-argument print() gives the same output on Python 2 and 3.
    print(len(data))
开发者ID:nashdb,项目名称:BigData-Yelp,代码行数:9,代码来源:tfidfYelp.py
示例11: tf_idf
def tf_idf(sc, title_token):
    """Compute TF-IDF vectors for a local collection of tokenized titles.

    Args:
        sc: SparkContext used to parallelize the input.
        title_token: Local collection handed to ``sc.parallelize``;
            presumably one token list per title -- confirm with callers.

    Returns:
        The TF-IDF vectors, hashed into 100 features.
    """
    hashing_tf = HashingTF(100)
    title_rdd = sc.parallelize(title_token)
    tf = hashing_tf.transform(title_rdd)
    # Formats to exactly what the Python 2 statement `print tf, ' tf'`
    # emitted, while also being valid Python 3.
    print("%s  tf" % (tf,))
    idf = IDF().fit(tf)
    return idf.transform(tf)
开发者ID:IcedNecro,项目名称:AWO-61-backend,代码行数:10,代码来源:service_func.py
示例12: tfidf
def tfidf(self, tokenizer):
    """Compute the TF-IDF matrix RDD with Spark's TF/IDF helpers.

    Args:
        tokenizer: Forwarded unchanged to ``self._create_rdd``.

    Returns:
        A triple ``(self.rdd, idf_model, tfidf_rdd)``.
    """
    self._create_rdd(tokenizer)
    term_freqs = HashingTF().transform(self.token_rdd)
    idf_model = IDF(minDocFreq=2).fit(term_freqs)
    weighted = idf_model.transform(term_freqs)
    return self.rdd, idf_model, weighted
开发者ID:xiaoyubai,项目名称:wiki-search,代码行数:10,代码来源:model.py
示例13: vectorize
def vectorize(sc, rdd_words, size=0):
    '''
    Vectorize words as hashed term frequencies.

    The vector dimension has to be chosen up front. When ``size`` is falsy
    it is derived from the data as the number of distinct words plus 10000.
    (The original note says the HashingTF default would be 2^20.)

    ``sc`` is accepted but not used here.
    '''
    num_features = size
    if not num_features:
        distinct_words = rdd_words.flatMap(lambda x: x).distinct().count()
        num_features = distinct_words + 10000
    return HashingTF(num_features).transform(rdd_words)
开发者ID:2221758805,项目名称:SparkDemo,代码行数:10,代码来源:demo_vectorize.py
示例14: main
def main():
    """
    Driver program for a spam filter using Spark and MLlib.

    Builds ``data/spam.txt`` and ``data/ham.txt`` from the raw e-mail
    directories, trains a logistic-regression classifier on hashed word
    features, prints the error rate on a held-out 30% split and pickles
    the model to ``spamFilter.pkl``.
    """
    # Consolidate the individual email files into a single spam file
    # and a single ham file.
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing.
    sc = SparkContext(appName="Spam Filter")

    # Load the spam and ham data files into RDDs.
    spam = sc.textFile("data/spam.txt")
    ham = sc.textFile("data/ham.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)

    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

    # Combine positive and negative datasets into one.
    data = positiveExamples.union(negativeExamples)

    # Split the data into 70% for training and 30% test data sets.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Cache the training data to optimize the Logistic Regression.
    trainingData.cache()

    # Train the model with Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Create tuples of actual and predicted values.
    labels_and_predictions = testData.map(
        lambda email: (email.label, model.predict(email.features)))

    # Calculate the error rate as number wrong / total number.
    # Indexing the (label, prediction) pair replaces a tuple-parameter
    # lambda, which is a syntax error on Python 3.
    wrong = labels_and_predictions.filter(lambda pair: pair[0] != pair[1]).count()
    error_rate = wrong / float(testData.count())

    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence; `with` closes (and flushes) the
    # file even if pickling fails.
    with open("spamFilter.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    sc.stop()
开发者ID:badpaper,项目名称:coursework,代码行数:54,代码来源:spamFilter.py
示例15: tf_idf_cal
def tf_idf_cal(words_rdd):
    """Return the TF-IDF vectors of ``words_rdd`` rendered as strings.

    Both the TF-IDF RDD and its string form are cached.
    """
    term_freqs = HashingTF().transform(words_rdd)
    idf_model = IDF().fit(term_freqs)
    weighted = idf_model.transform(term_freqs).cache()
    as_text = weighted.map(lambda line: str(line))
    return as_text.cache()
开发者ID:gitofsid,项目名称:MyBigDataCode,代码行数:11,代码来源:tf_idf_amazon.py
示例16: test_binary_term_freqs
def test_binary_term_freqs(self):
    """With setBinary(True), a term counts 1.0 however often it repeats."""
    hashing_tf = HashingTF(100).setBinary(True)
    tokens = "a a b c c c".split(" ")
    size = hashing_tf.numFeatures
    actual = hashing_tf.transform(tokens).toArray()

    # Every distinct term is expected to contribute exactly 1.0 at its index.
    wanted_entries = {}
    for term in ("a", "b", "c"):
        wanted_entries[hashing_tf.indexOf(term)] = 1.0
    wanted = Vectors.sparse(size, wanted_entries).toArray()

    for idx in range(size):
        message = ("Error at " + str(idx) +
                   ": expected " + str(wanted[idx]) + ", got " + str(actual[idx]))
        self.assertAlmostEqual(actual[idx], wanted[idx], 14, message)
开发者ID:Brett-A,项目名称:spark,代码行数:11,代码来源:test_feature.py
示例17: generate_gender_tf
def generate_gender_tf(twProfilesRdd, numFe):
    """
    Generate term-frequency tuples (gender, sparse vector) from an RDD of
    tuples shaped like (gender, (clean words tuple)).

    The gender is translated through the module-level ``genderDict``;
    ``numFe`` is the number of features given to HashingTF.
    """
    hashing_tf = HashingTF(numFeatures=numFe)

    def to_gender_tf(gender_descr):
        gender_code = genderDict[gender_descr[0]]
        return (gender_code, hashing_tf.transform(gender_descr[1]))

    return twProfilesRdd.map(to_gender_tf)
开发者ID:Fiware,项目名称:incubated.Social-data-aggregator,代码行数:7,代码来源:gra_usr_descr.py
示例18: process
def process(self, newsRDD):
    """Turn an RDD of news items into LabeledPoints and print class counts.

    Stores ``newsRDD``, the derived ``FeaturesV2`` RDD and the resulting
    LabeledPoint RDD on ``self``. Each point is labelled with
    ``giveClasseN(1)`` and built from the hashed ``words + bg2 + bg3``.

    Args:
        newsRDD: RDD whose elements are accepted by ``FeaturesV2``.

    Returns:
        The LabeledPoint RDD (also kept as ``self.labeledPointsRdd``).
    """
    hashingTF = HashingTF(self.n)
    self.newsRDD = newsRDD
    self.featuresRDD = newsRDD.map(lambda x: FeaturesV2(x))
    self.labeledPointsRdd = self.featuresRDD.map(
        lambda x: LabeledPoint(x.giveClasseN(1),
                               hashingTF.transform(x.words + x.bg2 + x.bg3)))

    # The statistics are best-effort diagnostics: a failure here must not
    # prevent returning the labeled points. Only Exception is caught so
    # that KeyboardInterrupt / SystemExit still propagate.
    try:
        # One job counting every class at once, instead of five separate
        # filter().count() passes over the same RDD.
        class_counts = self.featuresRDD.map(
            lambda x: x.giveClasseN(1)).countByValue()
    except Exception as exc:
        print("class statistics unavailable: %s" % exc)
    else:
        print("nbTot %d" % sum(class_counts.values()))
        print("\tnbVeryNeg %d" % class_counts.get(0, 0))
        print("\tnbNeg %d" % class_counts.get(1, 0))
        print("\tnbPos %d" % class_counts.get(2, 0))
        print("\tnbVeryPos %d" % class_counts.get(3, 0))
    return self.labeledPointsRdd
开发者ID:sh19871122,项目名称:TM_2014-2015S2,代码行数:27,代码来源:UseFeaturesv2.py
示例19: processKeepNews
def processKeepNews(self, newsRDD):
    """Like a labeled-point conversion, but keep each source news item.

    Stores ``newsRDD`` and the derived ``FeaturesV2`` RDD on ``self`` and
    returns an RDD of ``(news, LabeledPoint)`` pairs, also kept as
    ``self.labeledPointsRdd``.
    """
    hashing_tf = HashingTF(self.n)
    self.newsRDD = newsRDD
    self.featuresRDD = newsRDD.map(lambda x: FeaturesV2(x))

    def with_news(features):
        news = features.news
        label = features.giveClasseN(1)
        vector = hashing_tf.transform(features.words + features.bg2 + features.bg3)
        return (news, LabeledPoint(label, vector))

    self.labeledPointsRdd = self.featuresRDD.map(with_news)
    return self.labeledPointsRdd
开发者ID:sh19871122,项目名称:TM_2014-2015S2,代码行数:8,代码来源:UseFeaturesv2.py
示例20: mySpark
def mySpark(minFreq, keyWord):
    """Print the document scoring highest on TF-IDF for a keyword.

    Reads ``list_berita-30.tsv`` (tab-separated; the fourth column is used
    as the document content), computes TF-IDF over the cleaned content and
    prints the (score, content) pair with the largest score.

    Args:
        minFreq: Converted with ``int()`` and passed to IDF as ``minDocFreq``.
        keyWord: Search keyword. After cleaning, only the first entry of the
            hashed keyword vector's ``indices`` is used for scoring.
    """
    # Text cleaning: lower-case, trim, drop everything except letters,
    # digits and spaces, then split on single spaces.
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub(r"[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)
    # The context is created here, so it is also released here; without
    # this a second call would try to start another SparkContext while
    # the first is still active.
    try:
        # Load documents content (one per line) + cleaning.
        rawData = sc.textFile("list_berita-30.tsv")
        fields = rawData.map(lambda x: x.split("\t"))
        documents = fields.map(lambda x: removePunctuation(x[3]))

        # Get documents content without word mapping.
        documentNames = fields.map(lambda x: x[3])

        # TF processing: 100K hash buckets just to save some memory.
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(documents)

        # IDF & TF-IDF processing.
        tf.cache()
        idf = IDF(minDocFreq=int(minFreq)).fit(tf)
        tfidf = idf.transform(tf)

        # Get keyword relevance with content and zip it.
        keywordTF = hashingTF.transform(removePunctuation(keyWord))
        keywordHashValue = int(keywordTF.indices[0])
        keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
        zippedResults = keywordRelevance.zip(documentNames)

        # Print the result; single-argument print() emits the same text
        # on Python 2 and 3.
        print("Best document for keywords is:")
        print(zippedResults.max())
    finally:
        sc.stop()
开发者ID:arsoedjono,项目名称:big-data,代码行数:43,代码来源:tfidf_mod.py
注:本文中的pyspark.mllib.feature.HashingTF类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论