本文整理汇总了Python中sklearn.datasets.base.Bunch类的典型用法代码示例。如果您正苦于以下问题:Python Bunch类的具体用法?Python Bunch怎么用?Python Bunch使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Bunch类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: shuffleData
def shuffleData(self, res):
    """Shuffle labelled samples and pack them into a ``Bunch``.

    Parameters
    ----------
    res : mutable sequence of indexable pairs
        Judging by the indexing below, item 0 of each pair is the target
        and item 1 is the data sample.  It is handed to ``shuffle`` first
        (presumably ``random.shuffle``, i.e. in-place -- confirm against the
        file's imports).

    Returns
    -------
    Bunch
        With ``data``, ``target`` (both plain lists) and ``target_names``
        (taken from ``self.names``).
    """
    shuffle(res)
    train = Bunch()
    # Build real lists instead of using ``map``: on Python 3 ``map`` returns
    # a one-shot iterator, which cannot be indexed, measured with ``len`` or
    # iterated twice.  On Python 2 the result is the same list as before.
    train.data = [sample[1] for sample in res]
    train.target = [sample[0] for sample in res]
    train.target_names = self.names
    return train
开发者ID:anantauprety,项目名称:sentiment-analysis,代码行数:7,代码来源:sentiment_data.py
示例2: gen_tf_idf_space
def gen_tf_idf_space():
    """Build the TF-IDF space of the training corpus and persist it.

    Loads the training bunch with ``read_object(train_data)``, fits a
    ``TfidfVectorizer`` on ``bunch.contents`` and stores the resulting
    term-document matrix (``tdm``) and learned ``vocabulary`` in a new
    ``Bunch`` that is written with ``save_object(tf_idf_space_data, ...)``.
    ``train_data`` and ``tf_idf_space_data`` are names defined outside this
    function.
    """
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label,
                         filenames=bunch.filenames, vocabulary={})
    # The original also instantiated a TfidfTransformer that was never used;
    # the vectorizer alone produces the TF-IDF matrix, so it was dropped.
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
开发者ID:Eric-aihua,项目名称:MachineLearning,代码行数:10,代码来源:n_bayes_main.py
示例3: calc_tfidf
def calc_tfidf(trainsetfile,stopwordfile,dstdir):
    """Fit a TF-IDF model on the training set and dump the word bag.

    Parameters
    ----------
    trainsetfile : str
        Path of a joblib dump holding a Bunch with ``target_name``,
        ``label`` and ``contents``.
    stopwordfile : str
        Path passed to ``read_stopword`` to obtain the stop-word list.
    dstdir : str
        Directory in which ``word_bag.data`` is written.
    """
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    # Fix: this used to read the misspelled ``data_set.tatget_name``.  The
    # producers store the class list under ``target_name``; the misspelled
    # key is either left as an empty list or absent altogether.
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,max_df = 0.8,min_df=3,max_features=50000,stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag,dstdir+"/"+"word_bag.data",compress=3)
开发者ID:wadeallstar,项目名称:python-fraud-detect,代码行数:13,代码来源:process_tool.py
示例4: testset_tfidf
def testset_tfidf(testsetfile,stopwordfile,myvocabulary):
    """Vectorise the test set against a fixed vocabulary and dump the result.

    Parameters
    ----------
    testsetfile : str
        Path of a joblib dump holding a Bunch with ``target_name``,
        ``label`` and ``contents``.
    stopwordfile : str
        Path passed to ``read_stopword`` to obtain the stop-word list.
    myvocabulary : mapping or iterable
        Vocabulary handed to ``TfidfVectorizer(vocabulary=...)`` so that the
        columns line up with the training matrix.

    Returns
    -------
    Bunch
        The word bag, which is also written to
        ``test_wordbag/test_word_bag.data``.
    """
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    # Fix: this used to read the misspelled ``data_set.tatget_name``, which
    # is empty or missing; the class list lives under ``target_name``.
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,stop_words=stopwordlist,vocabulary=myvocabulary)
    # NOTE(review): fit_transform fits on the test corpus itself, so the IDF
    # weights presumably come from the test data rather than from training --
    # confirm this is intended.
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag,"test_wordbag/test_word_bag.data",compress=3)
    return wordbag
开发者ID:wadeallstar,项目名称:python-fraud-detect,代码行数:13,代码来源:process_tool.py
示例5: train_bags
def train_bags(token_path,filename,wordbag_path):
    """Read one token file per class and persist the corpus as a Bunch.

    Every entry of ``token_path`` is treated as a class: each line of the
    file becomes one document in ``contents`` and the position of the entry
    in the directory listing becomes its ``label``.

    Parameters
    ----------
    token_path : str
        Directory containing one text file per class.
    filename : str
        Name of the dump file created inside ``wordbag_path``.
    wordbag_path : str
        Directory in which the joblib dump is written.
    """
    # The misspelled ``tatget_name`` key is preserved as-is because changing
    # stored keys could break existing consumers of the dumped data; the real
    # class list is stored under ``target_name`` just below.
    data_set = Bunch(tatget_name=[],label=[],filenames=[],contents=[])
    dir_list = os.listdir(token_path)
    data_set.target_name = dir_list
    for entry in dir_list:
        # The label is constant for the whole file, so look it up once.
        label = data_set.target_name.index(entry)
        # ``with`` closes the handle even if reading fails part-way.
        with open(token_path+"/"+entry,"r") as file_read:
            for line in file_read:
                data_set.label.append(label)
                data_set.contents.append(line.strip())
    # Persist the collected corpus.
    joblib.dump(data_set, wordbag_path+"/"+filename, compress=3)
开发者ID:wadeallstar,项目名称:python-fraud-detect,代码行数:15,代码来源:process_tool.py
示例6: test_bunch_pickle_generated_with_0_16_and_read_with_0_17
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Unpickled Bunch objects must ignore a stale ``__dict__`` entry.

    Reproduces the situation where a Bunch pickle was created with
    scikit-learn 0.16 and is read with 0.17: reading ``bunch.key`` would use
    ``bunch.__dict__`` (non-empty for 0.16 objects) whereas assigning to
    ``bunch.key`` goes through ``bunch.__setattr__``.  See
    https://github.com/scikit-learn/scikit-learn/issues/6196 for details.
    """
    source = Bunch(key='original')
    # Simulate the 0.16 layout by planting a conflicting value in __dict__.
    source.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(source))

    def check_both_views(expected):
        # Attribute access and item access must agree on the stored value.
        assert_equal(restored.key, expected)
        assert_equal(restored['key'], expected)

    # After the pickle round-trip the __dict__ entry should have been ignored.
    check_both_views('original')
    # Changing the attribute must change the __getitem__ value as well.
    restored.key = 'changed'
    check_both_views('changed')
开发者ID:TaihuaLi,项目名称:scikit-learn,代码行数:19,代码来源:test_base.py
示例7: execute_NM_predict
def execute_NM_predict():
    """Train a multinomial naive Bayes model and print test predictions.

    Loads the test bunch (``read_object(test_data)``) and the persisted
    training TF-IDF space (``read_object(tf_idf_space_data)``), vectorises
    the test contents with the training vocabulary, fits ``MultinomialNB``
    on the training matrix and prints one line per test file with its actual
    and predicted category.  ``test_data`` and ``tf_idf_space_data`` are
    names defined outside this function.
    """
    test_bunch = read_object(test_data)
    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
                       tdm=[], vocabulary={})
    tf_idf_bunch = read_object(tf_idf_space_data)
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    # The original also instantiated a TfidfTransformer that was never used;
    # it was dropped.
    # NOTE(review): fit_transform fits on the test corpus, so the IDF weights
    # presumably come from the test data rather than training -- confirm.
    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary
    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # Predict the categories of the test documents.
    predicted = clf.predict(test_space.tdm)
    # Print the results in a friendlier form.  A single pre-formatted
    # argument keeps the output identical to the old space-separated print
    # statement while also being valid on Python 3.
    for label, file_name, excect_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print('%s  实际类别: %s  预测类别: %s' % (file_name, label, excect_cate))
开发者ID:Eric-aihua,项目名称:MachineLearning,代码行数:20,代码来源:n_bayes_main.py
示例8: scatter3d
def scatter3d(X, fig=None,ax=None ,color='b',cs=None, colorsMap='jet'):
    """Draw a 3-D scatter plot of ``X`` and return its figure and axes.

    Parameters
    ----------
    X : array-like indexed as ``X[:, 0..2]``
        Columns 0, 1 and 2 are used as x, y and z coordinates.
    fig : figure, optional
        Figure used for the colour bar when ``ax`` is supplied.
    ax : 3-D axes, optional
        Axes to draw on; a new figure with ``Axes3D`` is created when omitted.
    color : colour spec, optional
        Single colour used when ``cs`` is ``None``.
    cs : sequence of numbers, optional
        Per-point values mapped through ``colorsMap``; adds a colour bar.
    colorsMap : str, optional
        Name of the colormap used for ``cs``.

    Returns
    -------
    Bunch
        With attributes ``fig`` and ``ax``.
    """
    if cs is not None:
        cm = plt.get_cmap(colorsMap)
        cNorm = matplotlib.colors.Normalize(vmin=min(cs), vmax=max(cs))
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
    if ax is None:
        # Kept from the original: a fresh figure is always created when no
        # axes is supplied, even if ``fig`` was passed.
        fig = plt.figure()
        ax = Axes3D(fig)
    elif fig is None:
        # Fix: with only ``ax`` supplied, ``fig`` stayed None and the colour
        # bar call below raised AttributeError.  Use the axes' own figure.
        fig = ax.figure
    if cs is None:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2],c=color)
    else:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=scalarMap.to_rgba(cs))
        scalarMap.set_array(cs)
        fig.colorbar(scalarMap)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()
    b=Bunch()
    b.fig=fig
    b.ax=ax
    return b
开发者ID:ohadfel,项目名称:Baus,代码行数:22,代码来源:vis.py
示例9: reload
import os
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib
import jieba
from sklearn.feature_extraction.text import HashingVectorizer
reload(sys)
# sys.setdefaultencoding('utf-8')
# Directory holding one token file per class.
token_path = "token"+"/"
# Directory in which the word-bag corpus is written.
wordbag_path = "wordbag"+"/"
# A Bunch is used as the storage container for the corpus.
data_set = Bunch(target_name=[],label=[],filenames=[],contents=[])
dir_list = os.listdir(token_path)
data_set.target_name = dir_list
for file in dir_list:
    file_name = token_path+file
    # ``with`` closes the handle even if reading fails part-way (the original
    # closed it manually only after a fully successful read).
    with open(file_name,"r") as file_read:
        for line in file_read:
            # Every line is one document labelled with its file's class index.
            data_set.label.append(data_set.target_name.index(file))
            data_set.contents.append(line.strip())
# Persist the collected corpus.
joblib.dump(data_set, wordbag_path+"train_set1124.data", compress=3)
# Verification
开发者ID:wadeallstar,项目名称:python-fraud-detect,代码行数:31,代码来源:train_bags.py
示例10: reload
from sklearn.feature_extraction.text import TfidfVectorizer
reload(sys)
# Load the training corpus.
data_set={}
# Path of the persisted training corpus.
train_path='text_corpus1_wordbag/train_set.data'
# Read the persisted object.  ``with`` closes the file even if unpickling
# fails.  NOTE: pickle.load can execute arbitrary code from the file, so
# only load dumps from a trusted source.
with open(train_path,'rb') as file_obj:
    data_set=pickle.load(file_obj)
# Define the word-bag data structure.
wordbag=Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
wordbag.target_name=data_set.target_name
wordbag.label=data_set.label
wordbag.filenames=data_set.filenames
# Build the corpus.
corpus=data_set.contents
# Load the stop-word table from file.
stpwrdpath='extra_dict/hlt_stop_words.txt'
with open(stpwrdpath,'rb') as stpwrd_dic:
    stpwrd_content=stpwrd_dic.read()
# Convert the stop words into a list (one entry per line).
stpwrdlst=stpwrd_content.splitlines()
开发者ID:Pengfei-Zhu,项目名称:DataMining,代码行数:31,代码来源:tf-idffinal.py
示例11: main
def main():
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b') #, tokenizer=StemTokenizer())
vct_analizer = vct.build_tokenizer()
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10 # max(10, args.fixk)
args.fixk = None
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
### SENTENCE TRANSFORMATION
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
expert_data = Bunch()
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
#### EXPERT CLASSIFIER: SENTENCES
print("Training sentence expert")
labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)
expert_data.sentence.train.data = sent_train
expert_data.sentence.train.target = np.array(labels)
expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)
#### TESTING THE CLASSIFERS
test_target, test_data = split_data_sentences(data.test,sent_detector)
test_data_bow = vct.transform(test_data)
#pred_sent = sent_clf.predict(test_data_bow)
pred_ora = exp_clf.predict(test_data_bow)
y_probas = sent_clf.predict_proba(test_data_bow)
pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
## just based on one class probability
# order = np.argsort(y_probas[:,0])
order = np.argsort(y_probas.max(axis=1))
print "ORACLE\tSENTENCE\tMAX-SENT"
# for i in order[:500]:
# print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
for i in order[-500:]:
print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
print "Class distribution: %s" % pred_sent.sum()
print "Size of data: %s" % pred_sent.shape[0]
sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
#.........这里部分代码省略.........
开发者ID:mramire8,项目名称:active,代码行数:101,代码来源:test_sent.py
示例12: main
def main():
print args
print
accuracies = defaultdict(lambda: [])
ora_accu = defaultdict(lambda: [])
oracle_accuracies =[]
ora_cm = defaultdict(lambda: [])
lbl_dit = defaultdict(lambda: [])
aucs = defaultdict(lambda: [])
x_axis = defaultdict(lambda: [])
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10
args.fixk = None
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
parameters = experiment_utils.parse_parameters_mat(args.cost_model)
print "Cost Parameters %s" % parameters
cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
print "\nCost Model: %s" % cost_model.__class__.__name__
### SENTENCE TRANSFORMATION
if args.train == "twitter":
sent_detector = TwitterSentenceTokenizer()
else:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
expert_data = Bunch()
if not args.fulloracle:
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
if not args.fulloracle:
print "Training expert documents:%s" % len(expert_data.oracle.train.data)
labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
else:
# expert_data.data = np.concatenate((data.train.data, data.test.data))
# expert_data.target = np.concatenate((data.train.target, data.test.target))
expert_data.data =data.train.data
expert_data.target = data.train.target
expert_data.target_names = data.train.target_names
labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
expert_data.bow = vct.transform(sent_train)
expert_data.target = labels
expert_data.data = sent_train
exp_clf.fit(expert_data.bow, expert_data.target)
#.........这里部分代码省略.........
开发者ID:mramire8,项目名称:active,代码行数:101,代码来源:sent_unc.py
示例13: main
#.........这里部分代码省略.........
print ("Anytime active learning experiment - use objective function to pick data")
t0 = time.time()
tac = []
tau = []
### experiment starts
for t in range(args.trials):
trial_accu = []
trial_aucs = []
print "*" * 60
print "Trial: %s" % t
if args.student in "anyunc":
student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model)
elif args.student in "lambda":
student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
elif args.student in "anyzero":
student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model)
else:
raise ValueError("Oops! We do not know that anytime strategy. Try again.")
print "\nStudent: %s " % student
train_indices = []
neutral_text = [] # save the raw text of the queries
neutral_data = [] # save the xik vectors
train_x = []
train_y = []
neu_x = [] # data to train the classifier
neu_y = np.array([])
pool = Bunch()
pool.data = data.train.bow.tocsr() # full words, for training
pool.text = data.train.data
# pool.fixk = data.train.bowk.tocsr() # k words BOW for querying
pool.target = data.train.target
pool.predicted = []
# pool.kwords = np.array(data.train.kwords) # k words
pool.remaining = set(range(pool.data.shape[0])) # indices of the pool
bootstrapped = False
current_cost = 0
iteration = 0
query_index = None
query_size = None
while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
util = []
if not bootstrapped:
## random from each bootstrap
bt = randomsampling.BootstrapFromEach(t * 10)
query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
bootstrapped = True
query = pool.data[query_index]
print "Bootstrap: %s " % bt.__class__.__name__
print
else:
# print "pick instance"
## chose returns: index, k
## util returns: utility, k, unc
query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
query_index = [a for a, b in query_chosen]
开发者ID:mramire8,项目名称:active,代码行数:67,代码来源:anytime.py
示例14: get_data
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a dataset and train the oracle and sentence expert classifiers.

    The train and test parts are each split by ``split_data`` into an expert
    portion and a pool/test portion.  Documents are vectorised with ``vct``
    (fitted on the remaining training pool), and two shallow copies of
    ``clf`` are fitted on sentence-level data produced by
    ``split_data_sentences``.

    Parameters
    ----------
    clf : estimator
        Prototype classifier; it is copied with ``copy.copy`` before fitting.
    train, cats, fixk, raw
        Forwarded to ``load_from_file``.
    min_size : int
        NOTE: overridden with 10 below, as in the original code, so the
        caller's value has no effect.
    vct : vectorizer
        Fitted on the training pool here and returned.
    limit : int, optional
        Forwarded to ``split_data_sentences``.

    Returns
    -------
    tuple
        ``(exp_clf, data, vct, sent_clf, expert_data)``.

    Notes
    -----
    Reads and mutates the module-level ``args`` (sets ``args.fixk = None``).
    """
    import copy

    min_size = 10
    args.fixk = None
    # The vectorizer returned by load_from_file was never used by the original.
    data, _ = load_from_file(train, cats, fixk, min_size, vct, raw=raw)
    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ## delete <br> to "." to recognize as end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## Get the features of the sentence dataset
    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)
    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix (the vectorizer must be fitted before the
    ## transform calls further down)
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    # Single-argument print calls behave the same on Python 2 and 3.
    print(len(sent_train))
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)
    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
    # The dead ``sent_clf = None`` pre-assignment was removed; the classifier
    # is always created here.
    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)
    return exp_clf, data, vct, sent_clf, expert_data
开发者ID:mramire8,项目名称:active,代码行数:67,代码来源:score_distribution.py
示例15: Bunch
from sklearn.datasets.base import Bunch
# 分词后分类语料库路径
seg_path = "text_corpus_segment/"
# 词袋语料路径
wordbag_path = "text_corpus_wordbag/"
if not os.path.exists(wordbag_path):
os.makedirs(wordbag_path)
# Bunch类提供一种key,value的对象形式
# target_name:所有分类名称列表
# label:每个文件的分类标签列表
# filenames:文件名称
# contents:文件内容
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
# 获取seg_path下的所有子分类
class_list = os.listdir(seg_path)
data_set.target_name = class_list
# 获取每个子目录下所有的文件
for mydir in class_list:
class_path = seg_path + mydir + "/"
file_list = os.listdir(class_path) # 获取class_path下的所有文件
for file_name in file_list:
file_path = class_path + file_name
data_set.filenames.append(file_path) # 把文件路径附加到数据集中
data_set.label.append(data_set.target_name.index(mydir)) # 把文件分类标签附加到数据集中
with open(file_path, 'r', encoding='gb18030') as file:
seg_corpus = file.read() # 读取语料
开发者ID:longcd,项目名称:Text-Classification-System,代码行数:31,代码来源:train_bags.py
示例16: main
def main():
accuracies = defaultdict(lambda: [])
aucs = defaultdict(lambda: [])
x_axis = defaultdict(lambda: [])
vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
vct_analizer = vct.build_tokenizer()
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = max(100, args.fixk)
fixk_saved = "{0}{1}.p".format(args.train, args.fixk)
try:
fixk_file = open(fixk_saved, "rb")
data = pickle.load(fixk_file)
except IOError:
data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
fixk_file = open(fixk_saved, "wb")
pickle.dump(data, fixk_file)
# data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
parameters = parse_parameters_mat(args.cost_model)
print "Cost Parameters %s" % parameters
cost_model = set_cost_model(args.cost_function, parameters=parameters)
print "\nCost Model: %s" % cost_model.__class__.__name__
#### STUDENT CLASSIFIER
clf = linear_model.LogisticRegression(penalty="l1", C=1)
print "\nStudent Classifier: %s" % clf
#### EXPERT CLASSIFIER
exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
exp_clf.fit(data.test.bow, data.test.target)
expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
cost_function=cost_model.cost_function)
print "\nExpert: %s " % expert
#### ACTIVE LEARNING SETTINGS
step_size = args.step_size
bootstrap_size = args.bootstrap
evaluation_points = 200
print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
evaluation_points, args.fixk,
min_size))
print ("Cheating experiment - use full uncertainty query k words")
t0 = time.time()
### experiment starts
tx =[]
tac = []
tau = []
for t in range(args.trials):
trial_accu =[]
trial_aucs = []
trial_x_axis = []
print "*" * 60
print "Trial: %s" % t
student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
print "\nStudent: %s " % student
train_indices = []
train_x = []
train_y = []
pool = Bunch()
pool.data = data.train.bow.tocsr() # full words, for training
pool.fixk = data.train.bowk.tocsr() # k words BOW for querying
pool.target = data.train.target
pool.predicted = []
pool.kwords = np.array(data.train.kwords) # k words
pool.remaining = set(range(pool.data.shape[0])) # indices of the pool
bootstrapped = False
current_cost = 0
iteration = 0
while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
if not bootstrapped:
#.........这里部分代码省略.........
开发者ID:mramire8,项目名称:active,代码行数:101,代码来源:unckcheatv2.py
示例17: load_mask_images
import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch
from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask
# Compute one bounding rectangle per mask image and pickle the result as a
# gzip-compressed Bunch.
datasets = load_mask_images()
data = []
for f, mask in zip(
    datasets.filenames,
    load_image_files(datasets.filenames),
):
    # NOTE(review): the original comment here gave the rect order as
    # (min_x, max_x, min_y, max_x), which contradicts the description string
    # below, (min_x, min_y, max_x, max_y) -- confirm the actual order against
    # bounding_rect_of_mask.
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))
# Mirror the source dataset's metadata next to the computed rectangles.
bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'
# ``f`` is rebound here from the loop's filename to the output file handle.
with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
开发者ID:wkentaro,项目名称:d-image-pipeline,代码行数:29,代码来源:mask_to_rect.py
示例18: fetch_mixed_gambles
def fetch_mixed_gambles(n_subjects=1, data_dir=None, url=None, resume=True, return_raw_data=False, verbose=0):
    """Fetch Jimura "mixed gambles" dataset.

    Parameters
    ----------
    n_subjects: int, optional (default 1)
        The number of subjects to load. If None is given, all the
        subjects are used.

    data_dir: string, optional (default None)
        Path of the data directory. Used to force data storage in a specified
        location. Default: None.

    url: string, optional (default None)
        Override download URL. Used for test only (or if you setup a mirror of
        the data).

    resume: bool, optional (default True)
        If true, try resuming download if possible.

    verbose: int, optional (default 0)
        Defines the level of verbosity of the output.

    return_raw_data: bool, optional (default False)
        If false, then the data will be transformed into an (X, y) pair,
        suitable for machine learning routines. X is a list of
        n_subjects * 48 Nifti1Image objects (where 48 is the number of
        trials), and y is an array of shape (n_subjects * 48,).

    Returns
    -------
    data: Bunch
        Dictionary-like object, the interest attributes are :
        'zmaps': string list
            Paths to realigned gain betamaps (one nifti per subject) when
            ``return_raw_data`` is true; otherwise replaced by the X
            returned by ``_load_mixed_gambles``.
        'gain': ..
            Only set when ``return_raw_data`` is false: the y returned by
            ``_load_mixed_gambles``.
        'mask_img': ..
            Only set when ``return_raw_data`` is false: the mask image
            returned by ``_load_mixed_gambles``.

    References
    ----------
    [1] K. Jimura and R. Poldrack, "Analyses of regional-average activation
        and multivoxel pattern information tell complementary stories",
        Neuropsychologia, vol. 50, page 544, 2012
    """
    # Fix: None is documented as "all subjects" but previously reached
    # range(None) and raised TypeError (and ``None > 16`` fails on Python 3).
    if n_subjects is None:
        n_subjects = 16
    if n_subjects > 16:
        warnings.warn("Warning: there are only 16 subjects!")
        n_subjects = 16
    if url is None:
        url = "https://www.nitrc.org/frs/download.php/7229/" "jimura_poldrack_2012_zmaps.zip"
    opts = dict(uncompress=True)
    # One zmap file per subject, numbered from 1.
    files = [("zmaps%ssub%03i_zmaps.nii.gz" % (os.sep, (j + 1)), url, opts) for j in range(n_subjects)]
    data_dir = _get_dataset_dir("jimura_poldrack_2012_zmaps", data_dir=data_dir)
    zmap_fnames = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    data = Bunch(zmaps=zmap_fnames)
    if not return_raw_data:
        X, y, mask_img = _load_mixed_gambles(map(nibabel.load, data.zmaps))
        data.zmaps, data.gain, data.mask_img = X, y, mask_img
    return data
开发者ID:LisaLeroi,项目名称:nilearn,代码行数:65,代码来源:func.py
示例19: main
#.........这里部分代码省略.........
#expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function)
raise Exception("We do not know linear yet!!")
elif "log" in args.expert:
expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
elif "direct" in args.expert:
expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
else:
raise Exception("We need a defined cost function options [fixed|log|linear]")
#expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
print "\nExpert: %s " % expert
#### ACTIVE LEARNING SETTINGS
step_size = args.step_size
bootstrap_size = args.bootstrap
evaluation_points = 200
eval_range = 1 if (args.budget / evaluation_points) <= 0 else args.budget / evaluation_points
print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
evaluation_points, args.fixk,
50))
t0 = time.time()
### experiment starts
for t in range(args.trials):
print "*" * 60
print "Trial: %s" % t
# TODO shuffle the data??
#student = baselearner.BaseLearner(model=clf, cost_model=cost_model, accuracy_model=accuracy_model, budget=args.budget,
# seed=t)
student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
print "\nStudent: %s " % student
train_indices = []
train_x = []
train_y = []
pool = Bunch()
pool.data = data.train.bow.tocsr() # full words, for training
pool.fixk = data.train.bowk.tocsr() # k words BOW for querying
pool.target = data.train.target
pool.predicted = []
pool.kwords = np.array(data.train.kwords) # k words
pool.remaining = set(range(pool.data.shape[0])) # indices of the pool
#for x in pool.fixk:
# print x.todense().sum()
bootstrapped = False
current_cost = 0
iteration = 0
while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
if not bootstrapped:
## random bootstrap
#bt = randomsampling.BootstrapRandom(random_state=t * 10)
## random from each bootstrap
bt = randomsampling.BootstrapFromEach(t * 10)
query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
bootstrapped = True
print "Bootstrap: %s " % bt.__class__.__name__
print
else:
query_index = student.pick_next(pool=pool, k=step_size)
query = pool.fixk[query_index] # query with k words
开发者ID:mramire8,项目名称:active,代码行数:66,代码来源:traintest.py
示例20: writebunchobj
return bunch
# Write a bunch object to disk.
def writebunchobj(path,bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``.

    Parameters
    ----------
    path : str
        Destination file; it is opened in binary write mode and overwritten.
    bunchobj : object
        Any picklable object (a ``Bunch`` in this script).
    """
    # ``with`` guarantees the file is closed even if pickling raises; the
    # original leaked the handle in that case.
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj,file_obj)
# 1. Read the stop-word table (one stop word per line).
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()
# 2. Load the segmented test-set bunch object.
path = "test_word_bag/test_set.dat" # path of the persisted test set
bunch = readbunchobj(path)
# 3. Build the TF-IDF vector space for the test set.
testspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})
# 4. Load the word bag of the training set.
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
# 5. Initialise the vector space model with TfidfVectorizer, reusing the
#    training vocabulary.
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5,vocabulary=trainbunch.vocabulary)
transformer=TfidfTransformer() # computes per-term tf-idf weights; NOTE(review): not used in the visible code
# Convert the text to a tf-idf matrix; the vocabulary is stored separately.
# NOTE(review): fit_transform fits on the test contents, so IDF weights
# presumably come from the test data -- confirm this is intended.
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary
# Persist the word bag.
space_path = "test_word_bag/testspace.dat" # path where the vector space is saved
writebunchobj(space_path,testspace)
print "test词向量空间创建成功!!!"
开发者ID:2297988468,项目名称:Chinese-Text-Classification,代码行数:31,代码来源:test_space.py
注:本文中的sklearn.datasets.base.Bunch类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论