本文整理汇总了Python中pyspark.ml.feature.Tokenizer类的典型用法代码示例。如果您正苦于以下问题:Python Tokenizer类的具体用法?Python Tokenizer怎么用?Python Tokenizer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Tokenizer类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: fit_kmeans
def fit_kmeans(spark, products_df):
    """Fit a K-Means clustering pipeline on product titles.

    Stages: tokenize -> remove stop words -> hashing TF (16 buckets) -> IDF ->
    L2 normalize -> KMeans(k=2, fixed seed). Each stage's output column is
    prefixed with its stage number so intermediate column names never collide.

    :param spark: active SparkSession (not used directly; kept for API compatibility)
    :param products_df: DataFrame with a string column ``title``
    :return: tuple of (fitted PipelineModel, DataFrame with cluster predictions)
    """
    step = 0
    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")
    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")
    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)
    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")
    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")
    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)
    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    # FIX: a plain model.save() raises if ./kmeans already exists, so any second
    # run of this job used to crash; overwrite() makes the save idempotent.
    model.write().overwrite().save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
开发者ID:ohliumliu,项目名称:flash_deals_c9,代码行数:27,代码来源:kmean_model.py
示例2: token
def token(dataframe, in_col, out_col):
    """Tokenize ``in_col`` into a new array column ``out_col``.

    Prints the resulting schema as a side effect and returns the
    transformed DataFrame.
    """
    tokenized = Tokenizer(inputCol=in_col, outputCol=out_col).transform(dataframe)
    tokenized.printSchema()
    return tokenized
开发者ID:rjshanahan,项目名称:Text_Analytics_Topic_Modelling,代码行数:8,代码来源:topic_modelling_scikit.py
示例3: textPredict
def textPredict(request):
    """6. Text clustering / news popularity prediction (Django view).

    Reads ``label`` and ``title`` from the POST body, trains a decision-tree
    classifier over a parquet dump of Sina news titles, then classifies the
    submitted title with the fitted pipeline.

    NOTE(review): creates a brand-new SparkContext per request and hard-codes
    the master URL — acceptable for a demo only.
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Process the dataset and build feature vectors (tokenize -> TF -> IDF).
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    # Train the decision-tree model on indexed labels/features.
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    # Evaluate on the held-out split.
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    # Score the single user-submitted news item with the same featurization.
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)
    # Model evaluation.
    # NOTE(review): metricName="precision" is only accepted by Spark 1.x;
    # Spark 2+ expects e.g. "accuracy" — confirm the Spark version in use.
    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    # NOTE(review): django.shortcuts.render expects a template name as the
    # second positional argument — this call appears to be missing one; verify.
    return render(request,{'resultList':resultList})
开发者ID:JallyHe,项目名称:networkPublicOpinionAnalysisSystem,代码行数:58,代码来源:views.py
示例4: main
def main():
    """Glue/Spark job: tokenize the DBPedia train/test CSVs stored in S3,
    write the tokenized rows back to S3 as text, and export the fitted
    Tokenizer as an MLeap bundle repackaged into the model.tar.gz layout
    that SageMaker expects.
    """
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("abstract", StringType(), True)])

    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                     'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)  # csv_line is defined elsewhere in this file
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    # Upload the repackaged model artifact to the model bucket.
    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
开发者ID:FNDaily,项目名称:amazon-sagemaker-examples,代码行数:57,代码来源:dbpedia_processing.py
示例5: run_tf_idf_spark_ml
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    """Append TF-IDF ``features`` to *df* using the spark.ml (DataFrame) API.

    Tokenizes the ``body`` column into ``words``, hashes the words into
    ``numFeatures`` buckets (``rawFeatures``), and rescales by inverse
    document frequency.
    """
    tokenized = Tokenizer(inputCol="body", outputCol="words").transform(df)
    hashed = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures).transform(tokenized)
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(hashed)
    return idf_model.transform(hashed)
开发者ID:ctavan,项目名称:bbuzz2016,代码行数:11,代码来源:bbuzz2016-backup.py
示例6: predictLabel
def predictLabel(label,title,model):
    """Predict the label of a single news item (original: 预测新闻的标签).

    Builds a one-row DataFrame from (label, title), featurizes it the same
    way as at training time, and returns ``model.transform`` output.

    NOTE(review): relies on module-level globals ``sqlContext`` and
    ``idfModel`` that are not defined in this function — confirm they exist
    and that ``idfModel`` was fitted with the same numFeatures=20 hashing.
    """
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    # Hash title words into 20 TF buckets; must match the training featurizer.
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
开发者ID:JallyHe,项目名称:networkPublicOpinionAnalysisSystem,代码行数:12,代码来源:desionTree.py
示例7: create_features
def create_features(raw_data):
    """Turn an RDD of (appid, price, sentence) tuples into a DataFrame
    carrying TF-IDF ``features`` computed from the sentence text.

    NOTE: uses the module-level ``sqlContext``.
    """
    rows = raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2]))
    data_df = sqlContext.createDataFrame(rows)
    # sentence -> bag of words
    words_df = Tokenizer(inputCol='sentence', outputCol='words').transform(data_df)
    # words -> hashed term frequencies (5 buckets)
    featurized_df = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5).transform(words_df)
    # rescale term frequencies by inverse document frequency
    idf_model = IDF(inputCol='rawFeatures', outputCol='features').fit(featurized_df)
    return idf_model.transform(featurized_df)
开发者ID:DataLAUSDEclassProject,项目名称:spark,代码行数:13,代码来源:spark_cluster.py
示例8: _build_stages
def _build_stages(self):
    """Create the five pipeline stages, stash them on ``self``, and return
    them in execution order: HTML parse -> tokenize -> hashing TF -> IDF ->
    logistic regression.
    """
    parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
    words = Tokenizer(inputCol=parser.getOutputCol(), outputCol="words")
    term_freq = HashingTF(inputCol=words.getOutputCol(), outputCol="raw_features")
    idf = IDF(inputCol=term_freq.getOutputCol(), outputCol="features")
    classifier = LogisticRegression(maxIter=10, regParam=0.01)
    self.bs_parser = parser
    self.tokenizer = words
    self.hashing_tf = term_freq
    self.idf_model = idf
    self.lr = classifier
    return [parser, words, term_freq, idf, classifier]
开发者ID:ngarneau,项目名称:sentiment-analysis,代码行数:7,代码来源:pipelines.py
示例9: preprocessing_titles
def preprocessing_titles(path,name):
    """Tokenize titles, strip stop words, attach a unique index per id, and
    return an RDD of (index, id, filtered_words) tuples.

    If ``name`` is non-empty the joined DataFrame is also exported to S3.
    Relies on module-level helpers ``preprocessData``, ``exportOnS3`` and the
    module-level ``sqlContext``.
    """
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData= remover.transform(wordsData)
    # Assign a unique index to every document id, then join it back on.
    df = wordsData.map(lambda x:x['id']).zipWithUniqueId().toDF(["id","index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name!='':
        exportOnS3(qr,"s3a://redit-preprocessed/",name)
    qr = qr.map(lambda Row:(Row['index'],Row['id'],Row['filtered']))
    # FIX: the mapped result was computed and then silently discarded — the
    # function implicitly returned None. Return the tuples to the caller.
    return qr
开发者ID:wingsrc,项目名称:benchmark_minhash_lsh,代码行数:16,代码来源:preprocessing.py
示例10: getPipeline
def getPipeline(self, df):
    """Build a tokenize -> hashing-TF -> logistic-regression pipeline and
    fit it on *df*, returning the fitted PipelineModel.
    """
    # notify pipeline
    self.success('Initializing ML Pipeline ...')
    # tokenize the tag_features column, then vectorize the tokens
    tok = Tokenizer(inputCol='tag_features', outputCol='words')
    term_freq = HashingTF(inputCol=tok.getOutputCol(), outputCol='features')
    # logistic regression classifier on the hashed features
    classifier = LogisticRegression(maxIter=10, regParam=0.01)
    # assemble the stages and fit on the training dataframe
    return Pipeline(stages=[tok, term_freq, classifier]).fit(df)
开发者ID:cjzamora,项目名称:machine-learning,代码行数:17,代码来源:MLPipeline.py
示例11: get_top_words
def get_top_words(dataset, signatures):
    """Return {signature: [top-10 stemmed words by TF-IDF]} extracted from
    English-locale user comments of *dataset*, restricted to *signatures*.

    Returns an empty dict when there are no usable comments.
    """
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))
    if sentenceData.rdd.isEmpty():
        return dict()
    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)
    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])
    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()
    # Clean comment words by removing punctuation and stemming.
    # FIX: the pattern was previously written in a non-raw string full of
    # invalid escape sequences ('\,' etc., plus a duplicated '\;'), which only
    # worked because Python leaves unknown escapes intact and now emits
    # DeprecationWarnings. Same character set, as a compiled raw-string class.
    punctuation = re.compile(r'[,.;:?!\[\]}{/\\]')
    def clean_word(w):
        return punctuation.sub('', stem(w.lower()))
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])
    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)
    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)
    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    # For each requested signature keep the 10 highest-TF-IDF vocabulary indices.
    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()
    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
开发者ID:marco-c,项目名称:crashcorrelations,代码行数:40,代码来源:comments.py
示例12: main
def main():
    '''
    takes one input argument :: Location of the directory for training and test data files.
    :return: Print output on console for the area under the ROC curve.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # Fit the pipeline to training data (untuned baseline model).
    model = pipeline.fit(trainDF)
    # Hyper-parameter grid for cross-validation: 3 feature sizes x 9 reg strengths.
    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)
    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()
    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    # FIX: the original used Python-2-only `print x` statements, a SyntaxError
    # under Python 3; print() with a single argument works on both 2 and 3.
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
示例13: run_tf_idf_spark_mllib
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    """Compute TF-IDF over the ``body`` column using the spark.mllib (RDD)
    API, then stitch the resulting ``features``/``rawFeatures`` vectors back
    onto the original columns via zip + temp-table SQL.

    NOTE(review): uses the module-level ``sqlContext``; ``rdd.zip`` requires
    both sides to have identical partitioning and per-partition counts, which
    holds here only because each RDD derives from the same lineage — fragile.
    """
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)
    words = wordsData.select("words").rdd.map(lambda x: x.words)
    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()  # tf is traversed twice: IDF fit below and the second zip
    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)
    # @TODO make this nicer
    # Re-attach the TF-IDF vectors to the original rows as column `features`.
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    # Same trick again to also attach the raw term-frequency vectors.
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
开发者ID:ctavan,项目名称:bbuzz2016,代码行数:22,代码来源:bbuzz2016-backup.py
示例14: main
def main():
    """Compare an untuned TF + logistic-regression pipeline against a
    2-fold cross-validated one and save both evaluator scores to ``output``.

    NOTE(review): relies on module-level names ``sc``, ``training_input``,
    ``testing_input`` and ``output`` defined elsewhere in this file.
    """
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()
    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)
    # for cross validation: 3 feature sizes x 9 regularization strengths
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)
    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)
    # Persist both scores (untuned first, tuned second) as one text file.
    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:38,代码来源:spark_ml_pipline.py
示例15: BaselinePipelineEngine
class BaselinePipelineEngine(PipelineEngine):
    """Baseline sentiment pipeline: HTML parse -> tokenize -> hashing TF ->
    IDF -> logistic regression, plus the hyper-parameter grid searched by
    the cross-validator supplied to the constructor.
    """

    @keyword_only
    def __init__(self, cv):
        # Stages are built before the Pipeline below so that it can reference
        # the attributes _build_stages() sets on self.
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]  # candidate numFeatures values
        self.lr_map = [0.1, 0.01]  # candidate regParam values
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        """Create the five stages, store them on self, and return them in order."""
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        """Cross-product grid over hashing numFeatures and LR regParam."""
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
开发者ID:ngarneau,项目名称:sentiment-analysis,代码行数:23,代码来源:pipelines.py
示例16: BeautifulSoup
review_text = BeautifulSoup(raw_review).text
#
# 2. Remove non-letters
letters_only = re.sub("[^a-zA-Z]", " ", review_text)
#
# 3. Convert to lower case, split into individual words
words = letters_only.lower().split()
#
# 4. Remove stop words
meaningful_words = [w for w in words if not w in stops]
#
# 5. Join the words back into one string separated by space,
# and return the result.
return " ".join( meaningful_words)
# Stop-word set used by review_to_words (defined above, outside this view).
stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# Drop the TSV header row (index 0).
# NOTE(review): `lambda (row,index): ...` tuple-parameter unpacking is
# Python-2-only syntax; this script will not parse under Python 3.
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
# Columns of labeledTrainData.tsv: id, label, raw review text.
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
# Tokenize -> hashed TF (300 buckets) -> IDF rescaling.
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
# Keep only what the downstream classifier needs.
selectData = rescaledData.select("label","features")
开发者ID:rbkasat,项目名称:CSYE7374_FinalProject,代码行数:31,代码来源:RandomForest_TF-IDF.py
示例17: Tokenizer
from pyspark.sql import SparkSession
if __name__ == "__main__":
    session = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    # Toy corpus: (label, sentence) rows.
    sentences = session.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])

    # sentence -> list of words
    tokenized = Tokenizer(inputCol="sentence", outputCol="words").transform(sentences)
    # words -> hashed term-frequency vectors (20 buckets)
    featurized = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20).transform(tokenized)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(featurized)
    rescaled = idf_model.transform(featurized)

    for row in rescaled.select("features", "label").take(3):
        print(row)
    # $example off$

    session.stop()
开发者ID:1574359445,项目名称:spark,代码行数:31,代码来源:tf_idf_example.py
示例18: preProcess
# Distribute the (document, label) pairs; `sc` and `labeledData` are defined
# earlier in the file, outside this excerpt.
labeledRdd = sc.parallelize(labeledData)
from pyspark.sql import SQLContext
def preProcess(doc):
    """Normalize a raw review: replace HTML double line breaks with a space
    and lower-case the whole string."""
    return doc.replace("<br /><br />", " ").lower()
# Clean every labeled document, then split into train/test DataFrames.
rdd = labeledRdd.map(lambda doc : (preProcess(doc[0]),doc[1]))
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)
import itertools
# Build a token -> integer-index dictionary over the training vocabulary and
# broadcast it to the workers.
# NOTE(review): r.review is the raw review STRING, so chain(*lists) iterates
# characters, not tokens — this looks like it was meant to be r.words; verify.
lists=dfTrainTok.map(lambda r : r.review).collect()
dictWords=set(itertools.chain(*lists))
dictionaryWords={}
for i,word in enumerate(dictWords):
    dictionaryWords[word]=i
dict_broad=sc.broadcast(dictionaryWords)
from pyspark.mllib.linalg import SparseVector
def vectorize(row,dico):
vector_dict={}
for w in row.words:
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:31,代码来源:reglog_nocv_simple.py
示例19: cleanLower
def cleanLower(doc):
    """Replace HTML double line breaks with spaces, then lower-case the text."""
    without_breaks = doc.replace("<br /><br />", " ")
    return without_breaks.lower()
# Clean all labeled documents, build train/test DataFrames, and assemble the
# tokenize -> TF -> IDF -> label-index -> decision-tree pipeline.
# (`labeledRdd` and `sc` are defined earlier in the file.)
rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1]))
# FIX: the original Python-2-only `print "..."` statements are a SyntaxError
# under Python 3; the single-argument print() form works on both 2 and 3.
print("Text is cleaned")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])
print("Random split is done")
tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizer,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])
# NOTE(review): metricName='precision' is only accepted by Spark 1.x
# MulticlassClassificationEvaluator; Spark 2+ expects e.g. 'accuracy'.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# grid=(ParamGridBuilder()
# .baseOn([evaluator.metricName,'precision'])
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:30,代码来源:script2_nocv.py
示例20: Tokenizer
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession
# Minimal Tokenizer demo: split each input sentence into lower-cased words.
spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")

tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()

# FIX: `spark.stop` only referenced the bound method without calling it, so
# the SparkSession was never actually shut down.
spark.stop()
注:本文中的pyspark.ml.feature.Tokenizer类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论