This article collects typical usage examples of the Python function sklearn.metrics.pairwise.linear_kernel. If you have been wondering what exactly linear_kernel does, how to call it, or what real-world uses look like, the hand-picked code samples below should help.
Twenty code examples of linear_kernel are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
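Before the examples, here is a minimal sketch (not taken from any of the projects below; the toy documents are made up for illustration) of what linear_kernel computes: it is the plain dot product X @ Y.T, so on the L2-normalized rows produced by TfidfVectorizer it coincides with cosine similarity, which is exactly how most of the examples below use it.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

docs = ["the sky is blue", "the sun is bright", "the sun in the sky is bright"]
tfidf = TfidfVectorizer().fit_transform(docs)    # rows are L2-normalized by default
sims = linear_kernel(tfidf, tfidf)               # dot products, i.e. tfidf @ tfidf.T
assert np.allclose(sims, cosine_similarity(tfidf, tfidf))  # identical for unit-norm rows
print(sims.round(3))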
Example 1: pilot_test
def pilot_test():
    """
    """
    users_vectors = []
    vectorsums = []
    for i, user in enumerate(sample_users):
        df = pd.read_pickle('./fc8_100imgs_{}.pkl'.format(user))
        users_vectors.append(df)
        vectorsums.append(df.fc8.values.sum())
    corpus = []
    for vector in vectorsums:
        corpus.append(vector_to_document(vector))
    tfidf = TfidfVectorizer()
    tfidf_vectorized = tfidf.fit_transform(corpus)
    cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized)
    new_docs = []
    for i, user in enumerate(sample_users):
        for j, img_vec in enumerate(users_vectors[i].fc8):
            doc = vector_to_document(img_vec)
            new_docs.append(doc)
            # vectorized = tfidf.transform([doc])
            # sims = linear_kernel(vectorized, tfidf_vectorized)[0]
            # most_sims = np.argsort(sims)[::-1]
            #
            # print '{} img {} most similar to \n{}'.format(user, j, [(sample_users[i], sims[i]) for i in most_sims] )
    new_docs_vectorized = tfidf.transform(new_docs)
    cosine_similarities = linear_kernel(new_docs_vectorized, tfidf_vectorized)
    for sim in cosine_similarities:
        print 'top score: {} top user: {}'.format(sim.max(), sample_users[np.argmax(sim)])
Author: theod07 | Project: recommend-a-graham | Lines of code: 35 | Source file: tfidf_fc8.py
Example 2: plot_hist_d_to_centroid
def plot_hist_d_to_centroid(self, min_w=0):
    '''
    histograms of distance to centroids: overall vs. each cluster
    '''
    self.assign_cluster(min_w)
    self.cal_centroid()
    n_clusters = np.max(self.clusters)
    # fig = plt.figure(figsize=(20,8))
    X2_dense = self.X2.todense()
    centroid_overall = np.mean(X2_dense, axis=0)
    sim = linear_kernel(centroid_overall, X2_dense)
    max_sim = np.max(sim)
    min_sim = np.min(sim)
    # multiple plot, subplots
    ncols = 3
    nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
    # subplot preferred way
    fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
    axs = ax.flatten()
    i_plot = 0
    axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
    axs[i_plot].set_xlim(min_sim, max_sim)
    i_plot = i_plot + 1
    for i in xrange(n_clusters):
        cond = self.clusters == i
        arr = X2_dense[cond]
        sim = linear_kernel(self.centroids[i], arr)
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1
    fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
    plt.close(fig)
Author: joyce-duan | Project: All-Things-Data-Science | Lines of code: 34 | Source file: topic_modeling.py
Example 3: _build_similarity_matrix
def _build_similarity_matrix(self):
    """
    partitioned similarity matrix ('s' for source nodes and 't' for target nodes)
    S = [[S_ss, S_st],
         [S_ts, S_tt]]
    """
    normalize(self.source_features, norm='l2', copy=False)
    normalize(self.target_features, norm='l2', copy=False)
    self.ss = linear_kernel(self.source_features)
    self.st = linear_kernel(self.source_features, self.target_features)
    self.ts = self.st.T
    self.tt = linear_kernel(self.target_features)
Author: linhr | Project: dighub | Lines of code: 12 | Source file: graphs.py
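For reference, a hedged standalone sketch (the function name and the np.block assembly are mine, not part of the dighub project) of how the four blocks computed above fit together into the partitioned matrix S described in the docstring:

import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize

def build_partitioned_similarity(source_features, target_features):
    """Return S = [[S_ss, S_st], [S_ts, S_tt]] for L2-normalized feature rows."""
    src = normalize(source_features, norm='l2')   # copies instead of normalizing in place
    tgt = normalize(target_features, norm='l2')
    ss = linear_kernel(src)            # source vs. source (cosine, since rows are unit norm)
    st = linear_kernel(src, tgt)       # source vs. target
    tt = linear_kernel(tgt)            # target vs. target
    return np.block([[ss, st],
                     [st.T, tt]])      # shape: (n_source + n_target, n_source + n_target)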
Example 4: plot_hist_d_to_centroid
def plot_hist_d_to_centroid(self, min_w=0):
    '''
    plot histogram of distance to centroid, overall vs. per cluster
        - INPUT: self.X2
    '''
    self.assign_cluster(min_w)
    self.cal_centroid()
    n_clusters = np.max(self.clusters)
    # fig = plt.figure(figsize=(20,8))
    # multiple plot, subplots
    ncols = 3
    nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
    # subplot preferred way
    fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
    axs = ax.flatten()
    centroid_overall = np.mean(self.X2, axis=0)
    sim = linear_kernel(centroid_overall, self.X2)
    max_sim = np.max(sim)
    min_sim = np.min(sim)
    print 'sim shape: %s X shape: %s centroid_overall shape: %s' % (sim.shape, self.X2.shape, centroid_overall.shape)
    print 'min %.2f max %.2f ' % (min_sim, max_sim)
    print sorted(sim.flatten(), reverse=True)[:5]
    print sorted(centroid_overall.getA().flatten(), reverse=True)[:5]
    max_sim = 1
    min_sim = 0
    i_plot = 0
    axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
    axs[i_plot].set_xlim(min_sim, max_sim)
    i_plot = i_plot + 1
    for i in xrange(n_clusters + 1):
        cond = self.clusters == i
        arr = self.X2[cond]
        sim = linear_kernel(self.centroids[i], arr)
        print 'sim shape: %s arr shape: %s centroid shape: %s' % (sim.shape, arr.shape, self.centroids[i].shape)
        print sorted(sim.flatten(), reverse=True)[:5]
        print sorted(self.centroids[i].flatten(), reverse=True)[:5]
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1
    plt.show()
    fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
    plt.close(fig)
Author: joyce-duan | Project: All-Things-Data-Science | Lines of code: 49 | Source file: LDA_topics.py
Example 5: main
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    #     'This is the first document.',
    #     'This is the second second document.',
    #     'And the third one.',
    #     'Is this the first document?',
    # ]
    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())
    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary_
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
Author: zacheryschiller | Project: amr | Lines of code: 32 | Source file: scikitTest.py
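A side note on Example 5: the CountVectorizer + TfidfTransformer pair used there can be collapsed into a single TfidfVectorizer. The compressed sketch below (train/test strings reused from the example) fits the IDF on the training documents rather than on the test-set counts, so the exact numbers differ slightly from the example, but the pipeline is the same.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

train_set = ("The sky is blue.", "The sun is bright.")
test_set = ("The sun in the sky is bright.",
            "We can see the shining sun, the bright sun.")

vec = TfidfVectorizer()                       # counting + IDF weighting + L2 norm in one step
vec.fit(train_set)                            # learn vocabulary and IDF from the training set
test_tfidf = vec.transform(test_set)
print(linear_kernel(test_tfidf, test_tfidf))  # pairwise cosine similarities of the test docs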
Example 6: __asyncable_similarity
def __asyncable_similarity(tup):
    # bs, beer_id_ref, ref_vect, s_ids, b_ids, X_t, top = tup
    #     bs: beer similarity object for db commit
    #     ref_vects from one style
    #     ref_b_ids: beer ids for ref vecs
    #     s_ids, b_ids: style and beer indices of X_t
    #     X_t for beers in other styles to be compared to
    #     keep top similarities by style
    bs, b_refs, X_t_ref, b_comps, X_t_comp, top = tup
    start = dt.now()
    print "Beer ct %s vs ct %s: Compute Similarity" % (len(b_refs), len(b_comps))
    try:
        for i in xrange(len(b_refs)):
            # compute similarity between beer_ref[i] and all b_comps
            lk = linear_kernel(X_t_ref.getrow(i), X_t_comp).flatten()
            # take #top of largest similarities
            n = len(lk)
            kp = min(top, n)
            m_ixs = lk.argsort()[-kp:]
            sims = [(b_refs[i], b_comps[j], lk[j]) for j in m_ixs if b_refs[i] != b_comps[j]]
            # bs.smooth_similarity(sims)
            bs.add_many(sims)
        print "Comparison Complete: %s" % (dt.now() - start)
        return (b_refs, None)
    except Exception as e:
        return (b_refs, e)
Author: datascientistone | Project: dataprojects | Lines of code: 34 | Source file: beersimilarity.py
Example 7: __kernel_definition__
def __kernel_definition__(self):
    if self.Kf == 'rbf':
        return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
    if self.Kf == 'poly':
        return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
    if self.Kf is None or self.Kf == 'linear':
        return lambda X, Y: linear_kernel(X, Y)
Author: LucaDemo | Project: EasyMKL | Lines of code: 7 | Source file: komd.py
Example 8: thread_diag_block
def thread_diag_block(top_nbrs, dataM, job_ranges, r_offset, c_offset,
                      n_nbr=100, verbose=False):
    ''' (cos,idx)
    Note in the min-heap, the first one is the smallest.
    '''
    for job_bd in job_ranges:
        crossV = linear_kernel(dataM[job_bd[0]:job_bd[1], :], dataM)
        n_doc1, n_doc2 = crossV.shape
        for i_doc in range(n_doc1):
            i_offset = i_doc + job_bd[0] + r_offset
            L = top_nbrs[i_offset]
            for j in range(n_doc2):
                if i_offset == j + c_offset:
                    continue
                if len(L) < n_nbr:
                    heapq.heappush(L, (crossV[i_doc, j], j + c_offset))
                elif crossV[i_doc, j] > L[0][0]:
                    heapq.heapreplace(L, (crossV[i_doc, j], j + c_offset))
            top_nbrs[i_offset] = L
        if verbose:
            print('process range (%d,%d)' % (job_bd[0], job_bd[1]))
Author: changhw01 | Project: TextMining | Lines of code: 27 | Source file: doc_processer.py
Example 9: get_related_news
def get_related_news(articles, base_art_index):
    if related_dict.get(base_art_index) is not None:
        return related_dict.get(base_art_index)
    corpus = []
    for art in articles:
        corpus.append(' '.join(jieba.cut(art.context)))
    ls = [w for w in WordCutLibs.stopwords.split('\n')]
    vectorizer = CountVectorizer(stop_words=ls)
    X = vectorizer.fit_transform(corpus)
    #word = vectorizer.get_feature_names()
    #stopword = vectorizer.get_stop_words()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    #weight = tfidf.toarray()
    target = base_art_index  # target article index (order matches the SQL result)
    cosine_similarities = linear_kernel(tfidf[target], tfidf).flatten().argsort()
    max_len = len(cosine_similarities)
    bnd = -11 if max_len >= 10 else -(max_len)
    related_docs_indices = cosine_similarities[:bnd:-1]
    res = [articles[idx] for idx in related_docs_indices]
    related_dict[base_art_index] = res
    return res
Author: HRTseng | Project: III | Lines of code: 31 | Source file: HotIssueGroup.py
Example 10: print_most_cos_sim
def print_most_cos_sim(self, thresh=0.675):
    '''
    Prints the two posts that have the highest cosine similarity
    '''
    cos_sims = linear_kernel(self.word_vecs, self.word_vecs)
    # Initialize max_sim = 0, only consider cos sims under threshold
    # so we know we're not recording a post compared with itself (1.0)
    max_cos_sim = 0.0
    thr = thresh
    # Find max_cos_sim
    for i, j in enumerate(cos_sims):
        for k, l in enumerate(j):
            if (float(l) >= max_cos_sim) and (float(l) < thr):
                max_cos_sim = float(l)
    # Find indices of max_cos_sim
    double_break = False
    for i, j in enumerate(cos_sims):
        for k, l in enumerate(j):
            if float(l) == max_cos_sim:
                ind1, ind2 = i, k
                double_break = True
                break
        if double_break:
            break
    print 'Posts with highest cosine similarity ({:.3f}):\n\nPost {}:\n{}\
\n\nPost {}:\n{}'.format(max_cos_sim, ind1, self.posts[ind1],
                         ind2, self.posts[ind2])
Author: stong1108 | Project: CL_missedconn | Lines of code: 31 | Source file: kMeansPosts.py
Example 11: __init__
def __init__(self, *args, **kwargs):
    super(QUIRE, self).__init__(*args, **kwargs)
    self.Uindex = [idx for idx, _ in self.dataset.get_unlabeled_entries()]
    self.Lindex = [idx for idx in range(len(self.dataset)) if idx not in self.Uindex]
    self.lmbda = kwargs.pop("lambda", 1.0)
    X, self.y = zip(*self.dataset.get_entries())
    self.y = list(self.y)
    self.kernel = kwargs.pop("kernel", "rbf")
    if self.kernel == "rbf":
        self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop("gamma", 1.0))
    elif self.kernel == "poly":
        self.K = polynomial_kernel(
            X=X, Y=X, coef0=kwargs.pop("coef0", 1), degree=kwargs.pop("degree", 3), gamma=kwargs.pop("gamma", 1.0)
        )
    elif self.kernel == "linear":
        self.K = linear_kernel(X=X, Y=X)
    elif hasattr(self.kernel, "__call__"):
        self.K = self.kernel(X=np.array(X), Y=np.array(X))
    else:
        raise NotImplementedError
    if not isinstance(self.K, np.ndarray):
        raise TypeError("K should be an ndarray")
    if self.K.shape != (len(X), len(X)):
        raise ValueError("kernel should have size (%d, %d)" % (len(X), len(X)))
    self.L = np.linalg.inv(self.K + self.lmbda * np.eye(len(X)))
Author: ckbjimmy | Project: libact | Lines of code: 26 | Source file: quire.py
Example 12: get
def get(self):
    query = self.get_argument('q', None)
    if query is None:
        return
    queryTerms = query.split()
    # let's say we have N documents and M terms in query
    # Apparently we assume unique term in query
    # queryVector is a 1 * M dimension array
    queryVector = np.array([self._logIDF[term] for term in queryTerms])
    # docVectorDict is a N * M vector, with default value np.array([0] * M)
    docVectorDict = defaultdict(lambda: np.array([0] * len(queryTerms)))
    for i in range(len(queryTerms)):
        term = queryTerms[i].lower()
        newList = self._postingsList[term]
        for item in newList:  # newList is [(docID, tf)]
            docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
    docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
    docIx = 0
    docIxToDocID = {}
    for docID in docVectorDict.keys():
        docMatrix[docIx][:] = docVectorDict[docID][:]
        docIxToDocID[docIx] = docID
        docIx += 1
    # linear_kernel is used to compute the similarity
    sims = linear_kernel(queryVector, docMatrix).flatten()
    # argsort return the index
    bestDocIxes = sims.argsort()[::-1]
    bestDocSims = sims[bestDocIxes]
    bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
    postings = zip(bestDocIDs, bestDocSims)
    self.write(json.dumps({"postings": postings}))
Author: YuanyiYang | Project: NYUCourse | Lines of code: 32 | Source file: index.py
Example 13: predict
def predict(data, vect, user_list, tweet_list, word_counts):
    vector = vect.transform(data)
    result_matrix = linear_kernel(vector, word_counts)
    indices_of_tweets = []
    # For each tweet by the client, find the 30 most similar tweets
    # This list may include tweets by the client
    for row in result_matrix:
        indices = row.argsort()[:][::-1]
        indices_of_tweets.append(indices[2:51])
    # Return the person that tweeted each of the 50 most similar tweets
    user_array = np.array(user_list)
    persons_per_tweet = []
    for row in indices_of_tweets:
        persons_per_tweet.append(user_array[row])
    # Count up how many times each person shows up.
    # Same weighting is given to people who have many tweets similar to one client tweet
    # and a tweet that matches a high number of client tweets.
    persons_counter = Counter()
    for row in persons_per_tweet:
        persons_counter.update(row)
    # return the top 25 people in this list
    top_people_and_count = persons_counter.most_common(25)
    top_people = [tup[0] for tup in top_people_and_count]
    return top_people
Author: susiexsun | Project: capstone_project | Lines of code: 34 | Source file: tweetdoc_predict2.py
Example 14: getRelevantPassages
def getRelevantPassages(query, k):
    queryVector = allTextVectorizer.transform([query])
    queryIndices = numpy.array([allTextVectorizer.vocabulary_.get(word) for word in allTextAnalyzer(query)])
    queryIndices = [i for i in queryIndices if i is not None]
    querySimilarityScores = linear_kernel(queryVector[:, queryIndices], allTextIndex[:, queryIndices]).flatten()
    relatedDocIndices = querySimilarityScores.argsort()[:-k:-1]
    return [allTextLines[i] for i in relatedDocIndices]
Author: Jnanayogi33 | Project: ScienceQA | Lines of code: 7 | Source file: QAUtils.py
Example 15: get_results
def get_results(query):
    test = query
    response = tfidf.transform([test])
    print 'response: ', response
    RESULTS_ARRAY = []
    cosine_similarities = linear_kernel(response, tfs).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    for i in related_docs_indices:
        if cosine_similarities[i] > 0:
            file_name = token_dict.keys()[i].split('.')[0] + '.pdf.html.json'
            data = {}
            data = summary_dict[file_name]
            data.update({"candidate": token_dict.keys()[i].split('.')[0],
                         "cosine": cosine_similarities[i]})
            # data = {"candidate": token_dict.keys()[i].split('.')[0],
            #         "cosine": cosine_similarities[i]}
            RESULTS_ARRAY.append(data)
            # print "%-50s %.4f" % (token_dict.keys()[i].split('.')[0], cosine_similarities[i])
    # print RESULTS_ARRAY
    return RESULTS_ARRAY
Author: gvishal | Project: Sematic-Job-Recommendation-Engine | Lines of code: 26 | Source file: generic_search.py
Example 16: main
def main(protein_dict, q1, q2):
    if not protein_dict:
        angle_list = ['9999']
    elif len(protein_dict) < 2:
        angle_list = ['9999']
    else:
        train_set = [' '.join(protein_dict[x]) for x in protein_dict]
        proteins = [x for x in protein_dict]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf = tfidf_vectorizer.fit_transform(train_set)  # finds the tfidf score with normalization
        # print 'tfidf[0:1]', tfidf[0:1]
        # print 'tfidf[0:2]', tfidf[0:2]
        # print 'tfidf', tfidf
        cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
        # print 'cosine_similarities', cosine_similarities
        related_docs_indices = cosine_similarities.argsort()[:-5:-1]
        degrees_list = []
        for a in (cosine_similarities[related_docs_indices].tolist()):
            try:
                angle_list = []
                angle_in_radians = math.acos(a)
                angle_in_degrees = math.degrees(angle_in_radians)
                degrees_list.append(angle_in_degrees)
                angle_list.append(angle_in_degrees)
            except ValueError:
                angle_list = ['9999']
        if len(degrees_list) > 1:
            return_list = [degrees_list[1]]
        else:
            return_list = degrees_list
        return return_list
Author: rothadamg | Project: UPSITE | Lines of code: 35 | Source file: Cosine_Sim.py
Example 17: _apply_kernel
def _apply_kernel(self, x, y):
    """Apply the selected kernel function to the data."""
    if self.kernel == 'linear':
        phi = linear_kernel(x, y)
    elif self.kernel == 'rbf':
        phi = rbf_kernel(x, y, self.coef1)
    elif self.kernel == 'poly':
        phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
    elif callable(self.kernel):
        phi = self.kernel(x, y)
        if len(phi.shape) != 2:
            raise ValueError(
                "Custom kernel function did not return 2D matrix"
            )
        if phi.shape[0] != x.shape[0]:
            raise ValueError(
                "Custom kernel function did not return matrix with rows"
                " equal to number of data points."
            )
    else:
        raise ValueError("Kernel selection is invalid.")
    if self.bias_used:
        phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)
    return phi
Author: SuixueWang | Project: scikit-rvm | Lines of code: 26 | Source file: rvm.py
Example 18: sim_char5
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Author: softlang | Project: wikionto | Lines of code: 7 | Source file: seed_sim.py
Example 19: get
def get(self):
    query = self.get_argument('q', None)
    if query is None:
        return
    queryTerms = query.split()
    queryVector = np.array([self._logIDF[term] for term in queryTerms])
    docVectorDict = defaultdict(lambda: np.array([0] * len(queryTerms)))
    for i in range(len(queryTerms)):
        term = queryTerms[i].lower()
        newList = self._postingsLists[term]
        for item in newList:
            docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
    docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
    docIx = 0
    docIxToDocID = {}
    for docID in docVectorDict.keys():
        docMatrix[docIx][:] = docVectorDict[docID][:]
        docIxToDocID[docIx] = docID
        docIx += 1
    sims = linear_kernel(queryVector, docMatrix).flatten()
    bestDocIxes = sims.argsort()[::-1]
    bestDocSims = sims[bestDocIxes]
    bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
    postings = zip(bestDocIDs, bestDocSims)
    self.write(json.dumps({"postings": postings}))
Author: YuanyiYang | Project: NYUCourse | Lines of code: 25 | Source file: index.py
Example 20: _train
def _train(self, ds):
    """
    Train the engine.
    Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product. The 'stop_words' param
    tells the TF-IDF module to ignore common English words like 'the', etc.
    Then we compute similarity between all products using scikit-learn's linear_kernel (which in this case is
    equivalent to cosine similarity).
    Iterate through each item's similar items and store the 100 most-similar. Stops at 100 because well...
    how many similar products do you really need to show?
    Similarities and their scores are stored in redis as a Sorted Set, with one set for each item.
    :param ds: A pandas dataset containing two fields: description & id
    :return: Nothin!
    """
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(ds['content'])
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
    for idx, row in ds.iterrows():
        similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
        similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]
        # First item is the item itself, so remove it.
        # This 'sum' turns a list of tuples into a single tuple: [(1,2), (3,4)] -> (1,2,3,4)
        flattened = sum(similar_items[1:], ())
        self._r.zadd(self.SIMKEY % row['id'], *flattened)
Author: qqqwey941008 | Project: wechat_web_scraper | Lines of code: 31 | Source file: engines.py
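The engine above only writes the sorted sets; a hypothetical read side might look like the sketch below. The key pattern and the redis-py client are assumptions carried over from the snippet, not part of the original project.

def most_similar(r, simkey, item_id, n=10):
    """Return up to n (other_id, score) pairs, highest cosine similarity first."""
    # zrevrange walks the sorted set from the highest score (most similar) downwards
    return r.zrevrange(simkey % item_id, 0, n - 1, withscores=True)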
Note: The sklearn.metrics.pairwise.linear_kernel examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors. Please consult each project's license before distributing or reusing the code, and do not reproduce this article without permission.