This article collects typical usage examples of the Python class pyspark.ml.feature.StringIndexer. If you are wondering what StringIndexer is for, how to use it, or want to see it in real code, the curated class examples below may help.
The following shows 20 code examples of the StringIndexer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
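Before the examples themselves, here is a minimal, self-contained sketch of the fit/transform pattern they all share. The toy data, column names, and app name are illustrative assumptions, not taken from any of the cited projects:

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.appName("string-indexer-demo").getOrCreate()

# a tiny DataFrame with a string column to encode
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

# StringIndexer is an Estimator: fit() learns the string-to-index mapping
# (the most frequent value gets index 0.0 by default), and the resulting
# StringIndexerModel's transform() appends the encoded column.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
model.transform(df).show()
# under the default frequencyDesc ordering: "a" -> 0.0, "c" -> 1.0, "b" -> 2.0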
Example 1: main

def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]  # the fitted LogisticRegression stage (named gbtModel in the original)
    print(lrModel)  # summary only

Developer: yokeyong | Project: atap | Lines: 34 | Source: sc_classification.py
Example 2: testClassification

def testClassification(data):
    # Train a RandomForest model (the original comment said GradientBoostedTrees,
    # but the classifier used below is RandomForestClassifier).
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)

    # DataFrames no longer expose map() directly in Spark 2.x; go through the underlying RDD
    scoresAndLabels = predictionDF.rdd \
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)

Developer: WeihuaLei | Project: LearnSpark | Lines: 25 | Source: credit_prediction.py
Example 3: train_random_forest

def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # note: int(random.random()) is almost always 0, so this seed is effectively constant
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=int(random.random()))
    return rf, rf.fit(td)

Developer: ApplyHiTech | Project: DataScienceHW1 | Lines: 7 | Source: classify.py
Example 4: build_decisionTree

def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age

Developer: PranavGoel | Project: Apache_Spark-MlLiB-Titanic-Kaggle-Competition | Lines: 28 | Source: spark.py
Example 5: build_randomForest

def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10]) \
                             .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    # the original called rdf.fit(df) here, which bypasses the CrossValidator and its grid
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age

Developer: PranavGoel | Project: Apache_Spark-MlLiB-Titanic-Kaggle-Competition | Lines: 28 | Source: spark.py
Example 6: mapClickCategoricalFeatures

def mapClickCategoricalFeatures():
    indexed = ""
    df = getDataFrame(CLICKS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)

    # select columns to be mapped
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]

    for col in click_cols:
        if indexed == "":
            indexed = df
            print(indexed)
        outcol = col + "Index"
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()

    indexed.persist(StorageLevel.DISK_ONLY)
    #indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(PATH+"extraction/clicks1.csv")
    indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index') \
        .write.format('com.databricks.spark.csv') \
        .save(HADOOPDIR + "data/click_fraud/extraction/clicks_23feb12.csv")

Developer: ashishsjsu | Project: Spark101 | Lines: 32 | Source: extraction2.py
Example 7: mapPublisherCategoricalFeatures

def mapPublisherCategoricalFeatures():
    indexed = ""
    df = getDataFrame(PUBLISHERS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print(df.columns)

    publisher_cols = ["C0", "C1", "C2", "C3"]

    for col in publisher_cols:
        if indexed == "":
            indexed = df
            print(indexed)
        outcol = col + "Index"
        # StringIndexer maps each value in the input column to a double index
        # and creates a new column in the dataframe
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        # fit and transform the column using the indexer
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()

    indexed.persist(StorageLevel.DISK_ONLY)
    indexed.select('C0Index', 'C1Index', 'C2Index', 'C3Index') \
        .write.format('com.databricks.spark.csv') \
        .save(HADOOPDIR + "data/click_fraud/extraction/publishers_23feb12.csv")

Developer: ashishsjsu | Project: Spark101 | Lines: 29 | Source: extraction2.py
Example 8: run

def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1])
                            for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)

    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
            .filter(predictions.date2 == get_cur()) \
            .sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)

Developer: hongbin0908 | Project: bintrade | Lines: 30 | Source: diff_feature_cls.py
Example 9: label

def label(df, column):
    """
    Create a labeled column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column + '_label')
    df = indexer.fit(df).transform(df)
    return df

Developer: ribonj | Project: lsir | Lines: 7 | Source: ml.py
Example 10: indexStringColumns

def indexStringColumns(df, cols):
    # the variable newdata is updated several times
    newdata = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-x")
        sm = si.fit(newdata)
        newdata = sm.transform(newdata).drop(c)
        newdata = newdata.withColumnRenamed(c + "-x", c)
    return newdata

Developer: raul-arrabales | Project: Spark-Hands-on | Lines: 9 | Source: Session6.py
Example 11: events

def events(df, column_name):
    i = column_name + "I"
    v = column_name + "V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded

Developer: liber-pater | Project: ProjectThales | Lines: 9 | Source: GdeltDecisionTree.py
Example 12: indexStringColumns

def indexStringColumns(df, cols):
    from pyspark.ml.feature import StringIndexer
    # the variable newdf is updated several times
    newdf = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-num", c)
    return newdf

Developer: AkiraKane | Project: first-edition | Lines: 10 | Source: ch08-listings.py
Example 13: oneHotEncoding

def oneHotEncoding(self, df, input_col):
    stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
    model = stringInd.fit(df)
    td = model.transform(df)
    encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
    final_encoding = encoder.transform(td).select(df.id, 'features').cache()
    conv_udf = udf(lambda line: Vectors.dense(line).tolist())
    final_encoding = final_encoding.select(
        df.id, conv_udf(final_encoding.features).alias("num_" + input_col)).cache()
    return final_encoding

Developer: gitofsid | Project: MyBigDataCode | Lines: 11 | Source: anomaly_detection.py
Example 14: test_string_indexer_handle_invalid

def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)

Developer: Brett-A | Project: spark | Lines: 20 | Source: test_feature.py
Example 15: SQLContext

rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# ******************* CROSS VALIDATION: 80%/20% ******************
# ******************* Model: DecisionTreeClassifier **************
# ****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

Developer: pifouuu | Project: ProjetBigData | Lines: 31 | Source: script1.py
Example 16: load_data_frame

# The enclosing def line is cut off in this excerpt; the signature below is
# inferred from the call sites further down and may differ from the original.
def load_data_frame(csv_file, shuffle=True, train=True):
    # This is an RDD, which will later be transformed to a data frame
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    else:
        # Test data gets dummy labels. We need the same structure as in the train data
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])

train_df = load_data_frame("train.csv")
test_df = load_data_frame("test.csv", shuffle=False, train=False)

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

from distkeras.transformers import *
from pyspark.ml.feature import OneHotEncoder

#### One-hot encoding
nb_classes = 9
encoder = OneHotTransformer(nb_classes, input_col='label', output_col="label_encoded")
dataset_train = encoder.transform(indexed_df)
dataset_test = encoder.transform(test_df)

### Min-max scaling
from pyspark.ml.feature import MinMaxScaler
transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
                                o_min=0.0, o_max=250.0, \
# (the excerpt is truncated here in the original source listing)

Developer: ChienHsiung | Project: python | Lines: 31 | Source: spark101.py
Example 17: time

# In[326]:
print("Creating sparse vectors for all data based on this new dictionary")
t0 = time()
dfTrainSelect = dfTrain.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTestSelect = dfTest.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTrainSelect.take(1)
dfTestSelect.take(1)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[328]:
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)

# In[329]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)

# In[330]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

Developer: pifouuu | Project: ProjetBigData | Lines: 30 | Source: script3_bis.py
Example 18: WHEN

    CASE
        WHEN (pickup_hour <= 6 OR pickup_hour >= 20) THEN "Night"
        WHEN (pickup_hour >= 7 AND pickup_hour <= 10) THEN "AMRush"
        WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
        WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures)  # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")

Developer: MahsaBadami | Project: Azure-MachineLearning-DataScience | Lines: 31 | Source: ConsumeGBNYCReg.py
Example 19: StringIndexer

pandas_df['dayofweek'] = pandas_df['Dates'].dt.dayofweek
pandas_df['week'] = pandas_df['Dates'].dt.weekofyear
pandas_df['x_sim'] = pandas_df['X'].str[1:8]
pandas_df['X'] = pandas_df['X'].str[1:8]
pandas_df['y_sim'] = pandas_df['Y'].str[0:6]
pandas_df['X'] = pd.to_numeric(pandas_df['X'])
pandas_df['Y'] = pd.to_numeric(pandas_df['Y'])
pandas_df['x_sim'] = pd.to_numeric(pandas_df['x_sim'])
pandas_df['y_sim'] = pd.to_numeric(pandas_df['y_sim'])

# send the pandas DataFrame back to Spark
data_df = sqlContext.createDataFrame(pandas_df)

# encode the police district as a feature
stringIndexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Index")
model = stringIndexer.fit(data_df)
indexed = model.transform(data_df)
encoder = OneHotEncoder(dropLast=False, inputCol="PdDistrict_Index", outputCol="pd")
encoded = encoder.transform(indexed)

# remove data_df from memory
data_df.unpersist()

# encode the dependent variable - the category to predict
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)

Developer: mattosinski | Project: big-data-seminar-pyspark | Lines: 28 | Source: sparkTestv6.py
Example 20: time

# In[16]:
print("Creating feature vectors")
t0 = time()
dfTrainVec = dfTrain.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec = dfTest.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print("Dataframe created in {} second".format(round(tt, 3)))

# In[19]:
print("Indexing labels")
t0 = time()
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainVec)
dfTrainIdx = string_indexer_model.transform(dfTrainVec)
dfTrainIdx.take(1)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))

# In[20]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='featureVectors', labelCol='target_indexed', maxDepth=10)

# In[21]:

Developer: pifouuu | Project: ProjetBigData | Lines: 30 | Source: script4.py
Note: The pyspark.ml.feature.StringIndexer class examples in this article were compiled by 纯净天空 (VimSky) from source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from open-source projects contributed by their original authors, who retain the copyright; consult each project's license before reusing or redistributing the code. Do not reproduce this article without permission.