本文整理汇总了 Python 中 pyspark.ml.feature.IDF 类的典型用法代码示例。如果您正苦于以下问题:Python IDF 类的具体用法?Python IDF 怎么用?Python IDF 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了IDF类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: fit_kmeans
def fit_kmeans(spark, products_df):
    """Fit a TF-IDF + KMeans pipeline on product titles and persist it.

    The pipeline tokenizes the ``title`` column, removes stop words, hashes
    the tokens into 16 term-frequency buckets, rescales them with IDF,
    normalizes the vectors and clusters them with ``k=2``. Each stage writes
    to a column named ``"<position>_<stage>"`` (``"1_tokenizer"`` through
    ``"6_kmeans"``).

    :param spark: Not used by this function; kept so existing callers that
        pass it keep working.
    :param products_df: DataFrame with a ``title`` column.
    :return: Tuple ``(model, words_prediction)``: the fitted pipeline model
        and ``products_df`` transformed by it.
    """
    stage_names = ["tokenizer", "stopwords", "tf", "idf", "normalizer", "kmeans"]
    # Output column of every stage, numbered by its position in the pipeline.
    col = {name: "%d_%s" % (position, name)
           for position, name in enumerate(stage_names, start=1)}

    tokenizer = Tokenizer(inputCol="title", outputCol=col["tokenizer"])
    stopwords = StopWordsRemover(inputCol=col["tokenizer"], outputCol=col["stopwords"])
    tf = HashingTF(inputCol=col["stopwords"], outputCol=col["tf"], numFeatures=16)
    idf = IDF(inputCol=col["tf"], outputCol=col["idf"])
    normalizer = Normalizer(inputCol=col["idf"], outputCol=col["normalizer"])
    kmeans = KMeans(featuresCol=col["normalizer"], predictionCol=col["kmeans"], k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    # The whole fitted pipeline is saved in a folder. A plain save() raises
    # when "./kmeans" already exists, which would make every run after the
    # first one fail only after the (expensive) fit; overwrite instead.
    model.write().overwrite().save("./kmeans")
    return model, words_prediction
开发者ID:ohliumliu,项目名称:flash_deals_c9,代码行数:27,代码来源:kmean_model.py
示例2: tf_idf_feature
def tf_idf_feature(wordsData):
    """Print the TF-IDF vectors of the first three rows of ``wordsData``.

    Term frequencies are hashed from the ``filtered`` column into 20 buckets
    (``rawFeatures``) and rescaled by an IDF model fitted on the same data
    (``features``). Nothing is returned.

    :param wordsData: DataFrame with ``filtered`` and ``id`` columns.
    """
    term_freq_df = HashingTF(
        inputCol="filtered", outputCol="rawFeatures", numFeatures=20
    ).transform(wordsData)
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(term_freq_df)
    tfidf_df = idf_model.transform(term_freq_df)
    sample_rows = tfidf_df.select("features", "id").take(3)
    for row in sample_rows:
        print(row)
开发者ID:wingsrc,项目名称:benchmark_minhash_lsh,代码行数:8,代码来源:preprocessing.py
示例3: textPredict
def textPredict(request):
    """6. Text clustering and popularity prediction.

    Reads ``label`` and ``title`` from the POST data, trains a decision tree
    on TF-IDF features of the titles stored in
    ``data/roll_news_sina_com_cn.parquet``, scores the submitted title with
    that model and renders the prediction rows as ``resultList``.

    :param request: HTTP request whose ``POST`` mapping holds ``label`` and
        ``title``.
    :return: The result of ``render`` for the context
        ``{'resultList': resultList}``.
    :raises KeyError: If ``label`` or ``title`` is missing from ``POST``.
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    # Everything that uses the context runs inside try/finally: previously
    # sc.stop() was only reached on success, so any failure left the context
    # running for the lifetime of the process.
    try:
        sqlContext = SQLContext(sc)

        # Process the data set and build the feature vectors.
        dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
        print(dfTitles.dtypes)
        tokenizer = Tokenizer(inputCol="title", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(tokenizer.transform(dfTitles))
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        for features_label in rescaledData.select("features", "rawFeatures").take(3):
            print(features_label)

        # Train the decision-tree model. Both indexers are fitted on the
        # full data set before it is split.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4
        ).fit(rescaledData)
        (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        model = pipeline.fit(trainingData)

        # Test the model on the held-out split.
        predictions = model.transform(testData)
        predictions.show()
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Score the single news item submitted by the user. The tokenizer and
        # hashing transformer above are stateless, so they are reused here
        # instead of building identical copies; the IDF weights come from the
        # model fitted on the corpus.
        sentenceData = sqlContext.createDataFrame([
            (label, title),
        ], ['label', "title"])
        userFeaturized = hashingTF.transform(tokenizer.transform(sentenceData))
        myprediction = model.transform(idfModel.transform(userFeaturized))
        print("==================================================")
        myprediction.show()
        resultList = convertDfToList(myprediction)

        # Evaluate the model.
        # NOTE(review): metricName="precision" is not accepted by every Spark
        # release - confirm against the deployed version.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))
        treeModel = model.stages[2]
        print(treeModel)
    finally:
        sc.stop()
    # NOTE(review): render() is called without a template name - confirm this
    # matches the render helper imported by this module.
    return render(request, {'resultList': resultList})
开发者ID:JallyHe,项目名称:networkPublicOpinionAnalysisSystem,代码行数:58,代码来源:views.py
示例4: extract_idf_features
def extract_idf_features(p_df, input_col, output_col):
    """
    Fit an IDF model on a DataFrame and apply it to that same DataFrame.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.
    """
    fitted = IDF(inputCol=input_col, outputCol=output_col).fit(p_df)
    return fitted.transform(p_df)
开发者ID:rhasan,项目名称:machine-learning,代码行数:11,代码来源:Quora.py
示例5: run_tf_idf_spark_ml
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    """Compute TF-IDF vectors for the ``body`` column of ``df``.

    :param df: DataFrame with a ``body`` column.
    :param numFeatures: Number of hash buckets used for the term frequencies.
    :return: ``df`` with the added columns ``words``, ``rawFeatures`` and
        ``features``.
    """
    tokens_df = Tokenizer(inputCol="body", outputCol="words").transform(df)
    hasher = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    term_freq_df = hasher.transform(tokens_df)
    fitted_idf = IDF(inputCol="rawFeatures", outputCol="features").fit(term_freq_df)
    return fitted_idf.transform(term_freq_df)
开发者ID:ctavan,项目名称:bbuzz2016,代码行数:11,代码来源:bbuzz2016-backup.py
示例6: tfidf
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    """Hash term frequencies, then rescale them with a freshly fitted IDF model.

    Side effect: the fitted model is stored in the module-level ``idfModel``.

    :param dataframe: Input DataFrame.
    :param in_col1: Input column of the hashing step.
    :param out_col1: Output column of the hashing step.
    :param in_col2: Input column of the IDF step.
    :param out_col2: Output column of the IDF step.
    :param n: Number of hash buckets.
    :return: The DataFrame with both output columns added.
    """
    global idfModel
    hasher = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    hashed = hasher.transform(dataframe)
    idfModel = IDF(inputCol=in_col2, outputCol=out_col2).fit(hashed)
    return idfModel.transform(hashed)
开发者ID:rjshanahan,项目名称:Text_Analytics_Topic_Modelling,代码行数:11,代码来源:topic_modelling_scikit.py
示例7: tf_feature_vectorizer
def tf_feature_vectorizer(df,no_of_features,ip_col):
    """Add TF and TF-IDF columns derived from ``ip_col`` and print a preview.

    The term frequencies go to ``<ip_col>raw_features`` and the IDF-rescaled
    vectors to ``<ip_col>features``. The first five rows and the row count of
    the result are printed before it is returned.

    :param df: Input DataFrame.
    :param no_of_features: Number of hash buckets.
    :param ip_col: Name of the column to vectorize.
    :return: The DataFrame with both columns added.
    """
    raw_col = ip_col + "raw_features"
    tfidf_col = ip_col + "features"
    hashed = HashingTF(
        inputCol=ip_col, outputCol=raw_col, numFeatures=no_of_features
    ).transform(df)
    fitted = IDF(inputCol=raw_col, outputCol=tfidf_col).fit(hashed)
    result = fitted.transform(hashed)
    result.show(5)
    print(result.count())
    return result
开发者ID:vikaasa,项目名称:Spark_Workshop,代码行数:13,代码来源:sparking_your_interest.py
示例8: create_features
def create_features(raw_data):
    """Turn ``(appid, price, sentence)`` records into a TF-IDF DataFrame.

    Relies on the module-level ``sqlContext``.

    :param raw_data: RDD whose elements are indexable with the app id at 0,
        the price at 1 and the sentence at 2.
    :return: DataFrame with the added columns ``words``, ``rawFeatures``
        (5 hash buckets) and ``features``.
    """
    # Build the DataFrame from named rows.
    rows = raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2]))
    sentences_df = sqlContext.createDataFrame(rows)
    # Split each sentence into words.
    tokens_df = Tokenizer(inputCol='sentence', outputCol='words').transform(sentences_df)
    # Term frequency.
    term_freq_df = HashingTF(
        inputCol='words', outputCol='rawFeatures', numFeatures=5
    ).transform(tokens_df)
    # Inverse document frequency.
    fitted = IDF(inputCol='rawFeatures', outputCol='features').fit(term_freq_df)
    return fitted.transform(term_freq_df)
开发者ID:DataLAUSDEclassProject,项目名称:spark,代码行数:13,代码来源:spark_cluster.py
示例9: test_idf
def test_idf(self):
    """Check IDF's params, UID inheritance and output column override."""
    vectors = [
        DenseVector([1.0, 2.0]),
        DenseVector([0.0, 1.0]),
        DenseVector([3.0, 0.2]),
    ]
    dataset = self.spark.createDataFrame([(vec,) for vec in vectors], ["tf"])
    estimator = IDF(inputCol="tf")
    expected_params = [estimator.inputCol, estimator.minDocFreq, estimator.outputCol]
    self.assertListEqual(estimator.params, expected_params)
    # The output column is supplied only at fit time, through the param map.
    fitted = estimator.fit(dataset, {estimator.outputCol: "idf"})
    self.assertEqual(fitted.uid, estimator.uid,
                     "Model should inherit the UID from its parent estimator.")
    first_row = fitted.transform(dataset).head()
    self.assertIsNotNone(first_row.idf)
    # Test that parameters transferred to Python Model
    check_params(self, fitted)
开发者ID:JingchengDu,项目名称:spark,代码行数:14,代码来源:test_feature.py
示例10: makeTFIDF
def makeTFIDF(sc, spark, reviews):
    """Add hashed term-frequency (``tf``) and TF-IDF (``tfidf``) columns.

    :param sc: Not used by this function.
    :param spark: Not used by this function.
    :param reviews: DataFrame with a ``words_clean`` column.
    :return: ``reviews`` with the ``tf`` and ``tfidf`` columns added.
        Earlier versions returned ``None`` and discarded the result.
    """
    # HashingTF (1000 buckets) is used for fewer dimensions; the alternative
    # considered was CountVectorizer(inputCol='words_clean', outputCol='tf').
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)
    # Create the TF-IDF matrix.
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    # Rebinding the local name alone never reaches the caller, so the
    # transformed frame has to be returned.
    return tfidfModel.transform(reviews)
开发者ID:sam46,项目名称:Yelper,代码行数:14,代码来源:project.py
示例11: tf_idf
def tf_idf(df, column):
    """
    Compute TF-IDF of a corpus.
    Transformation: array<string> --> vector

    The IDF output is written to the scratch column ``'_' + column`` and then
    handed to ``replace`` together with ``column``.
    Returns the tuple ``(df, voc)``, where ``voc`` is the second value
    returned by ``count``.
    """
    scratch_col = '_' + column
    terms_df = preprocess(df, column)  # text to list of terms
    terms_df, voc = count(terms_df, column)
    # Fit the IDF model and use it to compute the feature vector.
    fitted = IDF(inputCol=column, outputCol=scratch_col).fit(terms_df)
    weighted_df = fitted.transform(terms_df)
    return (replace(weighted_df, column, scratch_col), voc)
开发者ID:ribonj,项目名称:lsir,代码行数:15,代码来源:ml.py
示例12: append_tf_idf
def append_tf_idf(self, df):
    """
    Add hashed term-frequency and TF-IDF columns computed from the 'tokens' column.
    Args:
        :param df: Dataframe with a 'tokens' column.
    Returns:
        :return: Dataframe with term frequency and inverse document frequency added in the columns
            'rawFeatures' and 'features' respectively.
    """
    # Term-frequency column, hashed into 100000 buckets.
    hasher = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
    term_freqs = hasher.transform(df)
    # Persisted because the frame feeds both the IDF fit and the transform below.
    term_freqs.persist(StorageLevel.MEMORY_AND_DISK)
    # IDF column.
    fitted = IDF(inputCol="rawFeatures", outputCol="features").fit(term_freqs)
    return fitted.transform(term_freqs)
开发者ID:ari99,项目名称:wiki_stats,代码行数:20,代码来源:operations.py
示例13: get_top_words
def get_top_words(dataset, signatures):
    """Return the top TF-IDF comment words for each requested signature.

    Only rows with a non-null ``user_comments`` value are used, and only when
    ``useragent_locale`` is null or starts with ``'en'``. The comments of one
    signature are merged into a single document before TF-IDF is computed.

    :param dataset: DataFrame with ``signature``, ``user_comments`` and
        ``useragent_locale`` columns.
    :param signatures: Signatures to report on.
    :return: Dict mapping each matching signature to a list of at most ten
        words, ordered by decreasing TF-IDF weight. Empty when no usable
        comment exists.
    """
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    locale = dataset['useragent_locale']
    is_english = locale.isNull() | (functions.instr(locale, 'en') == 1)
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & is_english)
    if sentenceData.rdd.isEmpty():
        return {}

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words within each comment, then concatenate the word
    # lists of all comments that share a signature.
    wordsData = (
        wordsData.rdd
        .map(lambda p: (p['signature'], list(set(p['words']))))
        .reduceByKey(lambda x, y: x + y)
        .toDF(['signature', 'words'])
    )
    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return {}

    # Clean comment words by removing punctuation and stemming. The pattern
    # is a raw-string character class matching exactly the characters the
    # previous alternation listed: , . ; : ? ! [ ] { } / and backslash.
    punctuation_pattern = r'[,.;:?!\[\]{}/\\]'

    def clean_word(w):
        return re.sub(punctuation_pattern, '', stem(w.lower()))

    wordsData = (
        wordsData.rdd
        .map(lambda p: (p['signature'], [clean_word(w) for w in p['words']]))
        .toDF(['signature', 'words'])
    )

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    def top_terms(row):
        # Pair every vocabulary index with its weight and keep the ten heaviest.
        vec = row['tfidf_features']
        ranked = sorted(zip(vec.indices, vec.values), key=lambda pair: pair[1], reverse=True)
        return (row['signature'], ranked[:10])

    bests_per_doc = (
        rescaledData.filter(rescaledData.signature.isin(signatures))
        .rdd.map(top_terms)
        .collect()
    )
    # Read the vocabulary once instead of on every lookup.
    vocabulary = model.vocabulary
    return {
        signature: [vocabulary[best] for best, _ in bests]
        for signature, bests in bests_per_doc
    }
开发者ID:marco-c,项目名称:crashcorrelations,代码行数:40,代码来源:comments.py
示例14: SparkContext
#print(data.head(5))
# Create the Spark contexts and load `data` into a DataFrame.
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])
#NEW VARIABLE GENERATION
# DataFrame.map only exists on Spark 1.x; going through .rdd works on both
# 1.x and 2.x+. 'spam' rows get label 1, everything else 0.
dataCleaned = df.rdd.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
# The label is stored as a float in the `label` column.
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)
# Hashed term frequencies (1000 buckets) rescaled by IDF into `features`.
hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])
# Train the model.
#rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing = 1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")
开发者ID:LJohnnes,项目名称:nlpmlsms,代码行数:31,代码来源:sms_spam_filtering_scalable.py
示例15: BeautifulSoup
review_text = BeautifulSoup(raw_review).text
#
# 2. Remove non-letters
letters_only = re.sub("[^a-zA-Z]", " ", review_text)
#
# 3. Convert to lower case, split into individual words
words = letters_only.lower().split()
#
# 4. Remove stop words
meaningful_words = [w for w in words if not w in stops]
#
# 5. Join the words back into one string separated by space,
# and return the result.
return " ".join( meaningful_words)
# English stop words, read by review_to_words when it filters each review.
stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# Drop the first line (index 0) of the file. Tuple-unpacking lambda
# parameters are a syntax error on Python 3, so the (row, index) pair is
# indexed instead; this form runs on Python 2 as well.
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
# Tab-separated fields: id, label, raw review text.
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
# Hashed term frequencies (300 buckets) rescaled by IDF into `features`.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
开发者ID:rbkasat,项目名称:CSYE7374_FinalProject,代码行数:31,代码来源:RandomForest_TF-IDF.py
示例16: SQLContext
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
# print(...) with a single argument behaves identically on Python 2 and 3;
# the statement form is a syntax error on Python 3.
print("Text is cleaned")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")
# Pipeline stages: tokenizer -> hashed term frequencies -> IDF, plus a label
# indexer feeding the decision tree.
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
# NOTE(review): metricName="precision" is not accepted by every Spark
# release - confirm against the deployed version.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:29,代码来源:script1.py
示例17: RegexTokenizer
from pyspark.ml.feature import RegexTokenizer
# Split the `text` column on runs of non-word characters.
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W+")
# COMMAND ----------
# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.
# COMMAND ----------
from pyspark.ml.feature import IDF, HashingTF, Normalizer
words_col = tokenizer.getOutputCol()
hashingTF = HashingTF(numFeatures=10000, inputCol=words_col, outputCol="hashingTF")
idf = IDF(minDocFreq=10, inputCol=hashingTF.getOutputCol(), outputCol="idf")
normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features")
# COMMAND ----------
# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.
# COMMAND ----------
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol="features", predictionCol="prediction", k=5, seed=0)
开发者ID:Inscrutive,项目名称:spark,代码行数:30,代码来源:TFIDF.py
示例18: main
def main(sc, sqlContext):
start = timer()
stpwrds = stopwords.words('english')
tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))
print '---Pegando produtos---'
start_i = timer()
productRDD = sc.parallelize(findProductsByCategory([]))
print '####levou %d segundos' % (timer()-start_i)
print '---Criando corpus---'
start_i = timer()
corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
.map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
.cache())
print '####levou %d segundos' % (timer()-start_i)
print '---Pegando e persistindo dados de categoria e tokens---'
start_i = timer()
tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
numTokens = len(tokens)
category = productRDD.map(lambda x: x[2]).distinct().collect()
categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
insertTokensAndCategories(tokens, category, categoryAndSubcategory)
print '####levou %d segundos' % (timer()-start_i)
print '---Calculando TF-IDF dos produtos---'
start_i = timer()
wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
#persistir isso para que ele nao tenha que fazer de novo na predicaoo
wordsDataDF = sqlContext.createDataFrame(wordsData)
#persistindo para a predicao
wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
#persistir isso para que ele nao tenha que fazer de novo na predicaoo
wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)
if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")
wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
idf = IDF(inputCol="rawFeatures", outputCol="features")
featurizedData = hashingTF.transform(wordsDataDF)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))
VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
print '####levou %d segundos' % (timer()-start_i)
print '--Criando modelo Naive Bayes---'
start_i = timer()
model = NaiveBayes.train(VSMTrain)
if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")
model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
print '####levou %d segundos' % (timer()-start_i)
print '---Testando modelo Naive Bayes---'
start_i = timer()
prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
acuraccy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count())
print 'acuracidade de %f' % acuraccy
print '####levou %d segundos' % (timer()-start_i)
print '---Pegando os posts---'
start_i = timer()
posts = list()
wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
sheet = wb['Menes']
for row in sheet.iter_rows(row_offset=1):
post = list()
for cell in row:
if cell.value is None:
break
post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))
if len(post) > 0:
posts.append(tuple(post))
print '####levou %d segundos' % (timer()-start_i)
print '---Criando corpus---'
start_i = timer()
postsRDD = sc.parallelize(posts)
postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
.map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
.cache())
#.........这里部分代码省略.........
开发者ID:felipecontra3,项目名称:recsys-tcc-ml,代码行数:101,代码来源:train_classifier.py
示例19: trainModel
def trainModel(self):
    """Build LDA topic vectors for page titles and store them on ``self.model``.

    Page ids and titles are fetched from the remote query API, cleaned of
    punctuation and Hebrew stop words, vectorized with CountVectorizer + IDF,
    factorized with a 100-topic LDA model and normalized.

    Side effects: registers the ``rmv`` SQL UDF and the ``wordstable`` temp
    table on ``self.spark``, and assigns ``self.model``. Note that the value
    stored there is the transformed DataFrame, not a fitted model object.
    """
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

    def SQLtoURL(query):
        # Collapse newlines, tabs and repeated blanks into single spaces.
        return ' '.join(query.split())

    def QueryCoola(query, file = None):
        # The call site below uses this name; the helper was previously
        # defined under a different one, so the call raised NameError.
        # `file` is accepted but not used.
        # NOTE(review): the API token is hard-coded here; it should be loaded
        # from configuration or the environment instead.
        session = Session()
        response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    table = json.loads(codecs.decode(QueryCoola(SQLtoURL(query)),'utf-8'))['table']
    # Each row is a dict whose 'c' entry holds the cells; each cell's value is
    # under 'v'. Column names come from the 'label' of every column entry.
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        return [d['v'] for d in row]

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        # Punctuation is deleted; stop words are replaced by a blank. Both are
        # plain substring replacements.
        for punc in punctuation:
            words = words.replace(punc,"")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)
    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # No column names are passed to LDA, so it runs with its defaults; the
    # Normalizer below reads its "topicDistribution" output.
    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)
    norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
开发者ID:NoamRosenberg,项目名称:Portfolio,代码行数:61,代码来源:engine.py
示例20: Tokenizer
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()
df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# Split each sentence into words.
df2 = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)
# Cached because df3 feeds both the IDF fit and the transform below.
df3.cache()
idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)
rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()
# stop must be called; the bare attribute reference `spark.stop` is a no-op
# that leaves the session running.
spark.stop()
开发者ID:oopchoi,项目名称:spark,代码行数:30,代码来源:tf_idf_sample.py
注:本文中的 pyspark.ml.feature.IDF 类示例由纯净天空整理自 GitHub/MSDocs 等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。
请发表评论