This article collects typical usage examples of the Python class sklearn.decomposition.TruncatedSVD. If you are unsure what TruncatedSVD is for, how to use it, or what working code that uses it looks like, the curated class examples below may help.
The following shows 20 code examples of the TruncatedSVD class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
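Before the project examples, here is a minimal self-contained sketch of the pattern that recurs throughout this page: a sparse TF-IDF matrix reduced with TruncatedSVD (latent semantic analysis) and then length-normalized. The toy corpus and the choice of 2 components are illustrative assumptions, not taken from any of the projects below.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# toy corpus; substitute your own documents
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs make good pets",
]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)                 # sparse term-document matrix

# TruncatedSVD accepts sparse input directly (unlike PCA, which would densify)
svd = TruncatedSVD(n_components=2, random_state=0)
X_lsa = svd.fit_transform(X)                  # dense array of shape (n_docs, 2)
X_lsa = Normalizer(copy=False).fit_transform(X_lsa)   # common LSA post-processing step

print(X_lsa.shape)
print(svd.explained_variance_ratio_)

Several of the examples below (e.g. Examples 1, 4, 12, and 17) follow essentially this TF-IDF followed by TruncatedSVD pipeline, differing mainly in vectorizer settings and the number of components.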
Example 1: tfIDFeats
def tfIDFeats(ids, data):
    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)
    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
Author: mostafaelaraby, Project: articles-clustering, Lines: 26, Source file: clusterRelated.py
Example 2: find_k
def find_k(self, rank=None, max_clusters=1, vertline=None):
    if rank is not None:
        svd = TruncatedSVD(rank)
        self.X = svd.fit_transform(self.X)
        self.X = Normalizer(copy=False).fit_transform(self.X)
    k_range = range(1, max_clusters)
    clusters = [KMeans(n_clusters=k).fit(self.X) for k in k_range]
    centroids = [cluster.cluster_centers_ for cluster in clusters]
    k_cosine = [cdist(self.X, cent, metric='cosine') for cent in centroids]
    dist = [np.min(k_cos, axis=1) for k_cos in k_cosine]
    wcss = [sum(d[np.isnan(d) == False]**2) for d in dist]  # Within-cluster sum of squares
    tss = sum(pdist(self.X)**2) / self.X.shape[0]            # Total sum of squares
    bss = tss - wcss                                         # Explained variance
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(10, 3)
    plt.tight_layout()
    ax1.set_title('BSS')
    ax1.plot(np.arange(1, len(bss) + 1), bss)
    ax1.scatter(np.arange(1, len(bss) + 1), bss)
    ax2.set_title('WCSS')
    ax2.plot(np.arange(1, len(wcss) + 1), wcss)
    ax2.scatter(np.arange(1, len(wcss) + 1), wcss)
    if vertline is not None:
        plt.axvline(vertline, c='red', alpha=0.75)
    plt.show()
Author: hugsnotpugs, Project: WhoReadsXKCD, Lines: 30, Source file: ScreePlots.py
Example 3: test_feature_union
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
Author: Givonaldo, Project: scikit-learn, Lines: 33, Source file: test_pipeline.py
Example 4: train_manual
def train_manual():
    with open("../data/f_hashtag_prediction/train_data_tweets_processed_0_to_500K.txt") as ftrain:
        with open("../data/f_hashtag_prediction/test_data_tagged_processed_manual.txt") as ftest:
            test_set = ftest.read().splitlines()
            train_set = ftrain.read().splitlines()
            # vectorizer = CountVectorizer()
            vectorizer = TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                         strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                         ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                         stop_words='english')
            # vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(train_set)
            print tfidf_matrix.shape
            smatrix = vectorizer.transform(test_set)
            print smatrix.shape
            svd = TruncatedSVD(n_components=500, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)
            print truncated_train_svd.shape
            print truncated_test_svd.shape
            cosine = cosine_similarity(truncated_test_svd[0], truncated_train_svd)
            print cosine
            print "TEST SET: "
Author: rudraksh125, Project: socialmedia, Lines: 29, Source file: tfidf.py
Example 5: cook
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'
'''
Author: wangchr, Project: eMeriL, Lines: 25, Source file: cook.py
Example 6: SVD_CV
def SVD_CV(counts, scores, n_comp=range(10, 611, 100)):
    n_avg = 16
    avg_err = []
    for n in range(0, n_avg):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            counts, scores, test_size=0.2, random_state=n)
        test_err = []
        for n in n_comp:
            TruncTrans = TruncatedSVD(n_components=n)
            X_trunc_train = TruncTrans.fit_transform(X_train, scores)
            regr = linear_model(X_trunc_train, y_train)
            X_trunc_test = TruncTrans.transform(X_test)
            y_pred = regr.predict(X_trunc_test)*10**(-12) + 3
            test_err.append(metrics.mean_squared_error(y_test, y_pred))
        if not avg_err:
            avg_err = test_err
        else:
            avg_err = [avg_err[i] + (test_err[i]*(1.0/n_avg)) for i in range(0, len(test_err))]
    plt.plot(n_comp, avg_err, label='Out-of-Sample Error')
    plt.xlabel('n components')
    plt.ylabel('MSE')
    plt.show()
Author: kacunningham413, Project: MetaShoeReview, Lines: 28, Source file: Metric_Models.py
Example 7: kfold
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: " + str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred), 5))
    print str(out)
    print np.mean(out)
Author: hurelyyu, Project: CS_Master_UW, Lines: 29, Source file: AgeGroup.py
Example 8: test_sparse_formats
def test_sparse_formats(fmt):
    Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)()
    tsvd = TruncatedSVD(n_components=11)
    Xtrans = tsvd.fit_transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
    Xtrans = tsvd.transform(Xfmt)
    assert_equal(Xtrans.shape, (n_samples, 11))
Author: AlexisMignon, Project: scikit-learn, Lines: 7, Source file: test_truncated_svd.py
Example 9: compute_svd
def compute_svd(Xs):
    # compute 1st principal component
    svd = TruncatedSVD(n_components=1, n_iter=20, random_state=0)
    svd.fit(Xs)
    pc = svd.components_
    print(pc.shape, svd.explained_variance_ratio_)
    return pc
Author: andra-pumnea, Project: Thesis, Lines: 7, Source file: weight_embeddings.py
Example 10: lsa_summarizer
def lsa_summarizer(text, num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)
    pos = np.array(list(range(len(sentenceTokens))))
    listlist = [list(x) for x in zip(sentenceTokens, concept, pos)]
    listlist.sort(key=lambda x: x[1], reverse=True)
    summarysentences = listlist[0:num_sen]
    summarysentences.sort(key=lambda x: x[2], reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
    summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary
Author: kenndanielso, Project: summarizer_app, Lines: 25, Source file: summarizer.py
Example 11: fit_document_matrix
def fit_document_matrix(self, X):
    """
    Reduce the dimension of sparse matrix X
    using Latent Semantic Analysis and
    build a nearest neighbor model.

    Parameters
    ----------
    X: sparse csr matrix, sparse term frequency matrix or
        other weighting matrix from documents
    """
    n_components = self.n_components
    n_iter = self.n_iter
    algorithm = self.algorithm

    lsa_model = TruncatedSVD(n_components=n_components,
                             n_iter=n_iter,
                             algorithm=algorithm)
    # reduce dimension using Latent Semantic Analysis
    vectors = lsa_model.fit_transform(X)
    self.vectors = vectors

    # build nearest neighbor model
    nbrs_model = build_nearest_neighbors(vectors, n_recommend=self.n_recommend)
    self.nbrs_model = nbrs_model

    return self
Author: KarimJedda, Project: science_concierge, Lines: 26, Source file: science_concierge.py
Example 12: basic_lsi
def basic_lsi(df, n_components=200, max_df=0.5, min_df=5):
    '''
    Basic LSI model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_components: number of lsi dimensions
        max_df: max_df in TfidfVectorizer
        min_df: min_df in TfidfVectorizer
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
    '''
    X = df['review']
    stopwords = nltk.corpus.stopwords.words('english')
    tfidf = TfidfVectorizer(stop_words=stopwords,
                            max_df=max_df, min_df=min_df)
    tfidf_trans = tfidf.fit_transform(X)
    svd = TruncatedSVD(n_components=n_components)
    svd_trans = svd.fit_transform(tfidf_trans)
    return tfidf, tfidf_trans, svd, svd_trans
Author: lwoloszy, Project: albumpitch, Lines: 28, Source file: eda.py
Example 13: buildKB16
def buildKB16(n_comp=200, seed_value=123):
    ## data
    # read the training/test data
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')

    # separate
    id_train = xtrain.ID; xtrain.drop('ID', axis=1, inplace=True)
    ytrain = xtrain.target; xtrain.drop('target', axis=1, inplace=True)
    id_test = xtest.ID; xtest.drop('ID', axis=1, inplace=True)

    # fit SVD
    svd = TruncatedSVD(n_components=n_comp, n_iter=5, random_state=seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)
    xtrain = pd.DataFrame(xtrain)
    xtest = pd.DataFrame(xtest)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain

    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    return
Author: mpearmain, Project: bnp, Lines: 35, Source file: build_datasets.py
Example 14: truncatedSVD
def truncatedSVD(data, labels, new_dimension):
    print "start truncatedSVD..."
    start = time.time()
    pca = TruncatedSVD(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
Author: sebastian-alfers, Project: master-thesis, Lines: 7, Source file: dimensionality_reduction.py
Example 15: test_inverse_transform
def test_inverse_transform(algo):
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    Xt = tsvd.fit_transform(X)
    Xinv = tsvd.inverse_transform(Xt)
    assert_array_almost_equal(Xinv, Xdense, decimal=1)
Author: AlexisMignon, Project: scikit-learn, Lines: 7, Source file: test_truncated_svd.py
Example 16: main
def main():
    infile = open(sys.argv[1])
    outfile = sys.argv[2]  # needs to be a string
    vocabfile = open(sys.argv[3])
    vocab = json.load(vocabfile)
    F = sparse.lil_matrix((len(vdict), 4*len(vdict)), dtype=np.int32)
    corpus_size = 0
    lc = 0
    for line in infile:
        lc += 1
        if lc % 10000 == 0:
            print('processing line ' + str(lc) + ' at ' + str(datetime.datetime.now()))
        words = line.split()
        num_words = len(words)
        corpus_size += num_words
        if num_words < 5:
            process_short_line(num_words, words, F, vocab)
        else:
            F[vocab[words[0]], 4 * vocab[words[1]] + 2] += 1
            F[vocab[words[0]], 4 * vocab[words[2]] + 3] += 1
            F[vocab[words[1]], 4 * vocab[words[0]] + 1] += 1
            F[vocab[words[1]], 4 * vocab[words[2]] + 2] += 1
            F[vocab[words[1]], 4 * vocab[words[3]] + 3] += 1
            F[vocab[words[-2]], 4 * vocab[words[-4]] + 0] += 1
            F[vocab[words[-2]], 4 * vocab[words[-3]] + 1] += 1
            F[vocab[words[-2]], 4 * vocab[words[-1]] + 2] += 1
            F[vocab[words[-1]], 4 * vocab[words[-3]] + 0] += 1
            F[vocab[words[-1]], 4 * vocab[words[-2]] + 1] += 1
            for i, word in enumerate(words[2:-2]):
                F[vocab[word], 4 * vocab[words[i-2]] + 0] += 1
                F[vocab[word], 4 * vocab[words[i-1]] + 1] += 1
                F[vocab[word], 4 * vocab[words[i+1]] + 2] += 1
                F[vocab[word], 4 * vocab[words[i+2]] + 3] += 1

    # compute PMI
    Fc = F.tocoo()
    word_freqs = Fc.sum(1)
    context_freqs = Fc.sum(0)
    word_freqs = word_freqs.A1
    context_freqs = context_freqs.A1
    for i, j, v in zip(Fc.row, Fc.col, Fc.data):
        F[i, j] = max(math.log((v * corpus_size) / (word_freqs[i] * context_freqs[j])), 0)

    # compute TruncatedSVD
    svd = TruncatedSVD(n_components=200)
    Fred = svd.fit_transform(F)
    np.savetxt(outfile, Fred, delimiter=',')
    infile.close()
    vocabfile.close()
Author: atmapersaud, Project: analogy-solver, Lines: 60, Source file: explicit_vectorization.py
Example 17: lsa
def lsa(BP, lentrain, n_components=16, preproc=True,
        fit_area='test', min_df=3):
    """
    aka Latent semantic analysis
    """
    if preproc:
        print "pre-processing data"
        traindata = []
        for observation in BP:
            traindata.append(preprocess_pipeline(observation, "english",
                                                 "WordNetLemmatizer", True, True, False))
        BP = traindata

    print "fitting TfidfVectorizer"
    tfv = TfidfVectorizer(min_df=min_df, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1,
                          smooth_idf=1, sublinear_tf=1, norm='l2')
    if fit_area == 'test':
        tfv.fit(BP[lentrain:])
    elif fit_area == 'train':
        tfv.fit(BP[:lentrain])
    else:
        tfv.fit(BP)

    print "transforming data"
    BP = tfv.transform(BP)
    print "BP(post):", BP.shape

    if 1:
        # svd here
        print "use svd"
        svd = TruncatedSVD(n_components=n_components, random_state=1)
        BP = svd.fit_transform(BP)

    return BP
Author: orazaro, Project: stumbleupon_kaggle, Lines: 34, Source file: prepare.py
Example 18: TF_Transformer
class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.cv_bi = CountVectorizer(min_df=2, max_df=0.7, ngram_range=(1, 2))
        self.tfidf_trans = TfidfTransformer()
        self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is none
    def fit(self, X, y=None):
        texts = [review.text for review in X]
        counts = self.cv_bi.fit_transform(texts)
        counts_tfidf = self.tfidf_trans.fit_transform(counts)
        self.SVD_trans.fit(counts_tfidf)
        return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
    def transform(self, X):
        texts = [review.text for review in X]
        counts = self.cv_bi.transform(texts)
        counts_tfidf = self.tfidf_trans.transform(counts)
        counts_trunc = self.SVD_trans.transform(counts_tfidf)
        return counts_trunc
Author: kacunningham413, Project: MetaShoeReview, Lines: 28, Source file: Metric_Models.py
Example 19: cluster_DBSCAN
def cluster_DBSCAN(args):
    """
    Clustering with DBSCAN on graph instances vectorized with EDeN.
    """
    # load data
    g_it = node_link_data.node_link_data_to_eden(input=args.input_file, input_type="file")
    vec = graph.Vectorizer(r=args.radius, d=args.distance, nbits=args.nbits)
    logger.info('Vectorizer: %s' % vec)
    X = vec.transform(g_it, n_jobs=args.n_jobs)
    logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))

    # project to a lower dimensional space to use clustering algorithms
    transformer = TruncatedSVD(n_components=args.n_components)
    X_dense = transformer.fit_transform(X)

    # log statistics on data
    logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

    # clustering
    clustering_algo = DBSCAN(eps=args.eps)
    y = clustering_algo.fit_predict(X_dense)
    msg = 'Predictions statistics: '
    msg += util.report_base_statistics(y)
    logger.info(msg)

    # save model for vectorizer
    out_file_name = "vectorizer"
    eden_io.dump(vec, output_dir_path=args.output_dir_path, out_file_name=out_file_name)
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)

    # save result
    out_file_name = "labels"
    eden_io.store_matrix(matrix=y, output_dir_path=args.output_dir_path, out_file_name=out_file_name, output_format="text")
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)
Author: nickgentoo, Project: pyEDeN, Lines: 35, Source file: cluster_DBSCAN.py
Example 20: solve
def solve(self, X, missing_mask):
    observed_mask = ~missing_mask
    X_filled = X
    for i in range(self.max_iters):
        # deviation from original svdImpute algorithm:
        # gradually increase the rank of our approximation
        if self.gradual_rank_increase:
            curr_rank = min(2 ** i, self.rank)
        else:
            curr_rank = self.rank
        tsvd = TruncatedSVD(curr_rank, algorithm=self.svd_algorithm)
        X_reduced = tsvd.fit_transform(X_filled)
        X_reconstructed = tsvd.inverse_transform(X_reduced)
        X_reconstructed = self.clip(X_reconstructed)
        mae = masked_mae(
            X_true=X,
            X_pred=X_reconstructed,
            mask=observed_mask)
        if self.verbose:
            print(
                "[IterativeSVD] Iter %d: observed MAE=%0.6f" % (
                    i + 1, mae))
        converged = self._converged(
            X_old=X_filled,
            X_new=X_reconstructed,
            missing_mask=missing_mask)
        X_filled[missing_mask] = X_reconstructed[missing_mask]
        if converged:
            break
    return X_filled
Author: GunnarEcon, Project: fancyimpute, Lines: 30, Source file: iterative_svd.py
Note: The sklearn.decomposition.TruncatedSVD class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by many programmers, and copyright remains with the original authors. Please consult the corresponding project's license before redistributing or reusing the code. Do not reproduce this article without permission.