This article collects typical usage examples of the Python class sklearn.preprocessing.MultiLabelBinarizer. If you have been wondering what exactly MultiLabelBinarizer is for, or how to use it, the hand-picked class examples below may help.
Shown below are 20 code examples of the MultiLabelBinarizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
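Before the examples, here is a minimal self-contained sketch of what the class does (the label sets below are made up for illustration): MultiLabelBinarizer maps collections of label sets to a binary indicator matrix with one column per class, and back again.

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# Each sample may carry several labels at once.
Y = mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
print(mlb.classes_)              # ['comedy' 'sci-fi' 'thriller']
print(Y)                         # [[0 1 1]
                                 #  [1 0 0]]
print(mlb.inverse_transform(Y))  # [('sci-fi', 'thriller'), ('comedy',)]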
Example 1: fit_images
def fit_images():
    client = pymongo.MongoClient('localhost', 27017)
    db = client['image_annotation']
    responses = db['mapped_responses'].find()
    no_labels = db['labels_binary'].find()
    numbers = []
    for i in no_labels:
        numbers.append(set([int(i["number"])]))
    train_data = []
    labels = []
    indexes = {}  # maps training-row index -> image number (undefined in the original snippet)
    mlb = MultiLabelBinarizer()
    mlb.fit(numbers)
    for index, instance in enumerate(responses):
        t_data = instance['hist']['0']
        indexes[index] = instance['image_no']
        train_data.append(t_data)
        label = instance['binary_results']
        new_labels = []
        for key, value in enumerate(label):
            value1 = int(value)
            new_labels.append(set([value1]))
        new_labels = mlb.transform(new_labels)
        labels.append(label)
    classifier = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    classifier.fit(train_data, labels)
    build_dir = getBuildDir()
    pickle.dump(classifier, open(join(build_dir, 'model.data'), 'w'), protocol=1)
    client.close()
Author: sreeram-boyapati, Project: image-annotation, Lines: 29, Source: classifier.py

Example 2: evaluate_solution
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None):
    """Evaluate the quality of the recovered user profile"""
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in sorted(hidden_edges)])
    # gold = np.array([E[e] for e in sorted(hidden_edges)])
    eh = sorted(hidden_edges)
    heads, tails = zip(*eh)
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[heads, :] - Cr[tails, :])
    # TODO: prediction here could be better: instead of predicting the k best
    # directions all the time, look at revealed edges to compute a similarity
    # threshold (i.e. replace 0.05)
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        my_pred = [suggestion[0]]
        if all_dir[suggestion[1]] < 0.05:
            my_pred.append(suggestion[1])
        pred.append(my_pred)
    pred = labeler.fit_transform(pred)
    return mse, f1_score(gold, pred, average='samples')
Author: daureg, Project: magnet, Lines: 26, Source: synth.py

Example 3: __init__
def __init__(self, inter_filePath="inter/technology_companies_of_the_united_states/"):
    # [[cat, cat...]...]
    self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True)
    self.dim = 400
    (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt")
    (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt")
    ## ---- By mean ----
    Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
    Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))
    ## ---- By SVM ----
    corpus_train = [" ".join(i) for i in context_categories_train]
    corpus_test = [" ".join(i) for i in context_categories_test]
    cv = CountVectorizer(min_df=1)
    X = cv.fit_transform(corpus_train)
    ## TF-IDF
    transformer = TfidfTransformer()
    X_tfidf = transformer.fit_transform(X)
    # Labels
    mlb = MultiLabelBinarizer()
    mlb.fit(correct_categories_train + correct_categories_test)
    Y = mlb.transform(correct_categories_train)  # transform to multilabel indicator
    # Predict test labels
    X_test = cv.transform(corpus_test)
    Y_test = mlb.transform(correct_categories_test)
    # Y_predict_ovr = self.ovrSVM(X, Y, X_test)
    Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
    # Y_predict_ovo = self.ovoSVM(X, Y, X_test)
    print "---One versus rest---"
    print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro')
    print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
Author: pkumusic, Project: HCE, Lines: 35, Source: SVMs.py

Example 4: run_classifier
def run_classifier(sentences, labels, test_docs):
    import numpy as np
    train_matrix, tfidf = tf_idf_fit_transform(sentences)
    test_sentences = doc2sentences(test_docs)
    sentence_matrix = tfidf.transform(test_sentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    # estimator = SVC(kernel='linear')
    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    predictions = classifier.predict(sentence_matrix)

    import csv
    with open("classified.csv", "w") as fl:
        writer = csv.writer(fl)
        for i in range(len(test_sentences)):
            curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x] == 1]
            writer.writerow((test_sentences[i], curr_pred))
Author: sarath1, Project: EventExtraction, Lines: 27, Source: sentence_classifier.py

Example 5: test_multilabel_classification_report
def test_multilabel_classification_report():
    n_classes = 4
    n_samples = 50
    make_ml = make_multilabel_classification
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                           n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                           n_samples=n_samples)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    lb = MultiLabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)
Author: nateyoder, Project: scikit-learn, Lines: 28, Source: test_classification.py

Example 6: ACMClassificator
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1)

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Author: morojenoe, Project: classificator, Lines: 29, Source: one_vs_rest_tree.py

Example 7: read_all_data
def read_all_data(p):
    img_src = "images/"
    df = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images(img_src)
    print("Finished reading images")

    x_images = []
    x_desc = []
    y_category = []
    all_categories = set()
    for asin in df.index.values:
        if asin in images:
            data = images[asin]
            x_images.append(data)
            item = df.loc[asin]
            x_desc.append(item.description)
            cate = item.categories
            y_category.append(cate)
            for c in cate:
                all_categories.add(c)
    print("Finished reading dataframe")

    mlb = MultiLabelBinarizer()
    y_total = mlb.fit_transform(y_category)
    x_images = np.array(x_images)
    x_desc = np.array(x_desc)
    return x_images, x_desc, y_total
Author: jeffwiroj, Project: ml_proj, Lines: 32, Source: image_classifier.py

Example 8: main
def main():
    # Explore the data for how many class labels there are
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.iteritems():
        if value not in usefulCountDict:
            usefulCountDict[value] = 1
        else:
            usefulCountDict[value] = usefulCountDict[value] + 1
    pprint(usefulCountDict)
    '''
    corpus, target = DictToList(reviewsDict)
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    print "Dumped feature vectors .... "
Author: yangyang861115, Project: Yelp-Project, Lines: 30, Source: createFeatureVectorsBinary.py

Example 9: get_training_data
def get_training_data(window_size_ms, train_time_sec=30):
    # Loop until empty input is detected
    X = []
    y = []
    print "Training time for each key is {} seconds".format(train_time_sec)
    i = 0
    while True:
        s = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
        if s:
            break
        j = 0
        while j < train_time_sec:
            j += (window_size_ms / float(1000))
            freq_spect = read_spectral_data_for_time(window_size_ms)
            X.append(freq_spect)
            y.append([i])
        # Increment key counter
        i += 1
    mb = MultiLabelBinarizer()
    y = mb.fit_transform(y)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
Author: johncava, Project: HackAZ_2016, Lines: 27, Source: serialize_training_data.py

Example 10: load_data
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
Author: MartinThoma, Project: algorithms, Lines: 32, Source: reuters.py

Example 11: generateTrainFeatures
def generateTrainFeatures(L):
    """
    Generates the training-data features and their target labels.

    Input:  L -> the number of training samples
    Output: trainX -> an (L x 2000) numpy matrix holding the 2000 features for
                      each of the L training samples
            trainY -> an (L x 185) numpy matrix holding the target classes of
                      the training samples

    Logic:
    The input text is read, preprocessed to remove stop words, and appended to
    a list. Similarly, each of the target class values is read into a list.
    The sklearn TF-IDF vectorizer generates the TF-IDF matrix for the 2000 most
    frequent words. Multi-label classification algorithms require a target Y of
    shape (n_samples, n_classes), so MultiLabelBinarizer is used to convert the
    list of classes to that matrix form.
    """
    global classOrder
    X = []
    Y = []
    # Read the input
    for i in range(L):
        categories = raw_input()
        target = [int(y) for y in categories.split(" ")]
        del target[0]
        meaningfulWords = readInput()
        Y.append(target)
        X.append(meaningfulWords)
    # Construct the TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(X).toarray()
    # Convert the target label list to a suitable matrix form
    mlb = MultiLabelBinarizer()
    trainY = mlb.fit_transform(Y)
    # Record the order of the classes
    classOrder = mlb.classes_
    return (trainX, trainY)
Author: hpam1, Project: Machine-Learning, Lines: 35, Source: labeler.py
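As the docstring of generateTrainFeatures notes, multi-label estimators expect a target of shape (n_samples, n_classes). A minimal sketch of just that conversion step, with made-up class-id lists:

from sklearn.preprocessing import MultiLabelBinarizer

Y = [[3, 7], [7], [1, 3, 9]]   # hypothetical per-sample class-id lists
mlb = MultiLabelBinarizer()
trainY = mlb.fit_transform(Y)  # indicator matrix of shape (3, 4)
print(mlb.classes_)            # [1 3 7 9] -- the column order of trainY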
Example 12: __init__
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.
    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict
            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)
            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.
        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities are not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # Number of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # Number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # Number of *PATHS* (not questions) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of the first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # TODO: number of questions where at least one correct path has been
        # recalled in the top N paths

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
Author: AmitShah, Project: yodaqa, Lines: 59, Source: fbpathtrain.py
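The known_lset filtering above exists because calling transform() on labels never seen during fit is unsafe: old scikit-learn versions raised an error, and newer ones drop the unknown classes with a warning. A minimal sketch of the same defensive pattern, with hypothetical label sets:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([{'a', 'b'}, {'c'}])
known = set(mlb.classes_)
new_lset = [{'a', 'x'}, {'c'}]              # 'x' was never seen at fit time
filtered = [ls & known for ls in new_lset]  # keep only labels the binarizer knows
Y = mlb.transform(filtered)                 # safe: no unknown classes remain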
Example 13: perform_train_test_split
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                             train_size=ds.DEFAULT_TRAININGSET_SIZE):
    """
    Gets all document ids of the given database and splits them according to
    the given train_size.

    :param db_name: name of the database whose documents are split (default DEFAULT_DB_NAME)
    :param train_size: size of the training set as a fraction in [0, 1]
    :return: splitted_dataset - list of lists
             [[DEFAULT_DATASET_LIST_INDEX_TRAINING],
              [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    doc_ids_list = []
    all_tag_list = []
    i = 0
    for row in all_docs.rows:
        document = row.doc
        # Append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        tag_list = []
        # If the document has tags, split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            for tag in tags_list:
                # Remove the closing tag character (last item)
                tag_list.append(tag[:-1])
        # Append the list of document tags to all_tag_list
        all_tag_list.append(tag_list)
        i += 1
        if i > 10000:
            break
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)
    print(len(doc_ids_list))
    splitted_dataset = cross_validation.train_test_split(doc_ids_list, tags_encoded,
                                                         train_size=train_size,
                                                         random_state=42,
                                                         stratify=tags_encoded)
    return splitted_dataset
Author: davcem, Project: stackexchange_text_classification, Lines: 59, Source: classifier_inspect_splits.py

Example 14: createDataMatrix
def createDataMatrix(ngram_features, character_gram_features, tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet)  # preserving case
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        # cll2.append(getClusters(voca_handmade, words))
    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text))
    mpqa_feat = csr_matrix(mpqa(tweetText, pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    mlb = MultiLabelBinarizer(sparse_output=True, classes=list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    # mlb = MultiLabelBinarizer(sparse_output=True, classes=list(set(voca_handmade.values())))
    # cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
    # sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams = csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
    # sent140affBigrams = csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features,
    #                                  sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams), dtype=float)
    y = []
    for i in categories:
        if i == 'positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
    # ffeatures, y = shuffle(ffeatures, y)
    return ffeatures, y
Author: balikasg, Project: SemEval2016-Twitter_Sentiment_Evaluation, Lines: 58, Source: my_utils.py

Example 15: xval
def xval(clf, x, y, train_index, test_index):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    mlb = MultiLabelBinarizer()
    y_pred = clf.predict_proba(x_test)
    mse = mean_squared_error(mlb.fit_transform(label_binarize(y_test, clf.classes_)), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
Author: shehzadqureshi, Project: NeuralNetDynamicOSI, Lines: 10, Source: test_basic_4bit_cv.py

Example 16: print_report
def print_report(name_classificator, testing_problems, testing_tags, predicted_problems, predicted_tags):
    predicted_problems, predicted_tags = make_right_order(testing_problems, predicted_problems, predicted_tags)
    mlb = MultiLabelBinarizer().fit(testing_tags + predicted_tags)
    testing_tags = mlb.transform(testing_tags)
    predicted_tags = mlb.transform(predicted_tags)
    print(name_classificator)
    print(classification_report(testing_tags, predicted_tags, target_names=mlb.classes_))
    print('label ranking average precision score =',
          label_ranking_average_precision_score(testing_tags, predicted_tags))
    print('\n', ('#' * 100), '\n')
Author: morojenoe, Project: classificator, Lines: 10, Source: report.py

Example 17: test_BRKnna_predict_dense
def test_BRKnna_predict_dense(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Author: quadflor, Project: Quadflor, Lines: 11, Source: test_BRKNN.py

Example 18: test_BRKnna_no_labels_take_closest
def test_BRKnna_no_labels_take_closest(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
    print(pred)
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
Author: quadflor, Project: Quadflor, Lines: 11, Source: test_BRKNN.py

Example 19: test_BRKnnb_predict_two_samples
def test_BRKnnb_predict_two_samples(self):
    data = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
Author: quadflor, Project: Quadflor, Lines: 11, Source: test_BRKNN.py

Example 20: run_classifierAccuracy
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
                  "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
                  "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                  "injured_or_dead_people", "missing_trapped_or_found_people"]
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane",
                       "Tornado", "Tsunami", "displaced_people_and_evacuations",
                       "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations",
                     "donation_needs_or_offers_or_volunteering_services",
                     "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations",
                       "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]

    import numpy as np
    curr_labels = all_labels
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer(classes=curr_labels)
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.ensemble import RandomForestClassifier
    # estimator = LinearSVC()
    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    from sklearn.metrics import f1_score, precision_score, recall_score
    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    print("Per-class Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Per-class Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Per-class F1", f1_score(test_label_matrix, predictions, average=None))
Author: sarath1, Project: EventExtraction, Lines: 53, Source: sentence_classifier.py
Note: the sklearn.preprocessing.MultiLabelBinarizer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright in each snippet remains with its original author, and redistribution or use should follow the corresponding project's license. Do not reproduce without permission.