Python pairwise.linear_kernel Function Code Examples


This article collects typical usage examples of the Python function sklearn.metrics.pairwise.linear_kernel. If you have been wondering what exactly linear_kernel does, how to call it, or what real-world uses look like, the curated code examples below should help.



The sections below show 20 code examples of the linear_kernel function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
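Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them share: linear_kernel(X, Y) returns the matrix of dot products between the rows of X and Y, so on the L2-normalized rows produced by TfidfVectorizer it is equivalent to cosine similarity.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel

    docs = ["the sky is blue", "the sun is bright", "the sun in the sky is bright"]
    tfidf = TfidfVectorizer().fit_transform(docs)      # rows are L2-normalized by default
    sims = linear_kernel(tfidf[0:1], tfidf).flatten()  # dot products == cosine similarities here
    print(sims.argsort()[::-1])                        # document indices, most similar first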

Example 1: pilot_test

def pilot_test():
	"""
	"""
	users_vectors = []
	vectorsums = []
	for i, user in enumerate(sample_users):
		df = pd.read_pickle('./fc8_100imgs_{}.pkl'.format(user))
		users_vectors.append(df)
		vectorsums.append(df.fc8.values.sum())

	corpus = []
	for vector in vectorsums:
		corpus.append(vector_to_document(vector))

	tfidf = TfidfVectorizer()
	tfidf_vectorized = tfidf.fit_transform(corpus)

	cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized)

	new_docs = []
	for i, user in enumerate(sample_users):
		for j, img_vec in enumerate(users_vectors[i].fc8):
			doc = vector_to_document(img_vec)
			new_docs.append(doc)
			# vectorized = tfidf.transform([doc])
			# sims = linear_kernel(vectorized, tfidf_vectorized)[0]
			# most_sims = np.argsort(sims)[::-1]
			#
			# print '{} img {} most similar to \n{}'.format(user, j, [(sample_users[i], sims[i]) for i in most_sims] )

	new_docs_vectorized = tfidf.transform(new_docs)
	cosine_similarities = linear_kernel(new_docs_vectorized, tfidf_vectorized)

	for sim in cosine_similarities:
		print 'top score: {}     top user: {}'.format(sim.max(), sample_users[np.argmax(sim)])
Developer: theod07, Project: recommend-a-graham, Lines of code: 35, Source file: tfidf_fc8.py


Example 2: plot_hist_d_to_centroid

    def plot_hist_d_to_centroid(self, min_w=0):
        '''
        histograms of distance to centroids: overall vs. each cluster
        '''
        self.assign_cluster(min_w)
        self.cal_centroid()
        n_clusters = np.max(self.clusters)
        #fig = plt.figure(figsize=(20,8))
        X2_dense = self.X2.todense()
        centroid_overall = np.mean(X2_dense, axis=0)
        sim = linear_kernel(centroid_overall, X2_dense)
        max_sim = np.max(sim)
        min_sim = np.min(sim)

        # multiple plot, subplots
        ncols = 3
        nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
        # subplot preferred way
        fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
        axs = ax.flatten()
        i_plot = 0
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1

        for i in xrange(n_clusters):
            cond = self.clusters == i
            arr = X2_dense[cond]
            sim = linear_kernel(self.centroids[i], arr)
            axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
            axs[i_plot].set_xlim(min_sim, max_sim)
            i_plot = i_plot + 1
        fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
        plt.close(fig)
Developer: joyce-duan, Project: All-Things-Data-Science, Lines of code: 34, Source file: topic_modeling.py


Example 3: _build_similarity_matrix

 def _build_similarity_matrix(self):
     """
     partitioned similarity matrix ('s' for source nodes and 't' for target nodes)
     S = [[S_ss, S_st],
          [S_ts, S_tt]]
     """
     normalize(self.source_features, norm='l2', copy=False)
     normalize(self.target_features, norm='l2', copy=False)
     self.ss = linear_kernel(self.source_features)
     self.st = linear_kernel(self.source_features, self.target_features)
     self.ts = self.st.T
     self.tt = linear_kernel(self.target_features)
Developer: linhr, Project: dighub, Lines of code: 12, Source file: graphs.py


Example 4: plot_hist_d_to_centroid

    def plot_hist_d_to_centroid(self, min_w=0):
        '''
        plot histogram of distance to centroid, overall vs. per cluster
                - INPUT: self.X2
        '''
        self.assign_cluster(min_w)
        self.cal_centroid()
        n_clusters = np.max(self.clusters)
        #fig = plt.figure(figsize=(20,8))

        # multiple plot, subplots
        ncols = 3
        nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
        # subplot preferred way
        fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
        axs = ax.flatten()

        centroid_overall = np.mean(self.X2, axis=0)
        sim = linear_kernel(centroid_overall, self.X2)
        max_sim = np.max(sim)
        min_sim = np.min(sim)
        print 'sim shape: %s  X shape: %s centroid_overall shape: %s' % (sim.shape, self.X2.shape, centroid_overall.shape)
        print 'min %.2f max %.2f ' % (min_sim, max_sim)
        print sorted(sim.flatten(), reverse=True)[:5]
        print sorted(centroid_overall.getA().flatten(), reverse=True)[:5]

        max_sim = 1
        min_sim = 0

        i_plot = 0
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1

        for i in xrange(n_clusters + 1):
            cond = self.clusters == i
            arr = self.X2[cond]
            sim = linear_kernel(self.centroids[i], arr)
            print 'sim shape: %s  arr shape: %s  centroid shape: %s' % (sim.shape, arr.shape, self.centroids[i].shape)
            print sorted(sim.flatten(), reverse=True)[:5]
            print sorted(self.centroids[i].flatten(), reverse=True)[:5]
            axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
            axs[i_plot].set_xlim(min_sim, max_sim)
            i_plot = i_plot + 1

        plt.show()
        fig.savefig(self.model_name + '_hist_dis_to_centroid.png')

        plt.close(fig)
Developer: joyce-duan, Project: All-Things-Data-Science, Lines of code: 49, Source file: LDA_topics.py


Example 5: main

def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    # 'This is the first document.',
    # 'This is the second second document.',
    # 'And the third one.',
    # 'Is this the first document?',
    # ]

    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
Developer: zacheryschiller, Project: amr, Lines of code: 32, Source file: scikitTest.py


Example 6: __asyncable_similarity

def __asyncable_similarity(tup):

    #  bs, beer_id_ref, ref_vect, s_ids, b_ids, X_t, top = tup

    # bs: beer similarity object for db commit
    # ref_vects from one style
    # ref_b_ids: beer ids for ref vecs
    # s_ids, b_ids: style and beer indices of X_t
    # X_t for beers in other styles to be compared to
    # keep top similarities by style
    bs, b_refs, X_t_ref, b_comps, X_t_comp, top = tup

    start = dt.now()
    print "Beer ct %s vs ct %s: Compute Similarity" % (len(b_refs), len(b_comps))
    try:
        for i in xrange(len(b_refs)):

            # compute similarity between beer_ref[i] and all b_comps
            lk = linear_kernel(X_t_ref.getrow(i), X_t_comp).flatten()

            # take #top of largest similarities
            n = len(lk)
            kp = min(top, n)
            m_ixs = lk.argsort()[-kp:]

            sims = [(b_refs[i], b_comps[j], lk[j]) for j in m_ixs if b_refs[i] != b_comps[j]]

            # bs.smooth_similarity(sims)
            bs.add_many(sims)

        print "Comparison Complete: %s" % (dt.now() - start)
        return (b_refs, None)
    except Exception as e:
        return (b_refs, e)
Developer: datascientistone, Project: dataprojects, Lines of code: 34, Source file: beersimilarity.py


Example 7: __kernel_definition__

 def __kernel_definition__(self):
     if self.Kf == 'rbf':
         return lambda X,Y : rbf_kernel(X,Y,self.rbf_gamma)
     if self.Kf == 'poly':
         return lambda X,Y : polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
     if self.Kf == None or self.Kf == 'linear':
         return lambda X,Y : linear_kernel(X,Y)
Developer: LucaDemo, Project: EasyMKL, Lines of code: 7, Source file: komd.py


Example 8: thread_diag_block

def thread_diag_block(top_nbrs,dataM,job_ranges,r_offset, c_offset,
                    n_nbr=100,verbose=False):
    
    ''' (cos,idx) 
        Note in the min-heap, the first one is the smallest.
    '''

    for job_bd in job_ranges:
        crossV = linear_kernel(dataM[job_bd[0]:job_bd[1],:],dataM)
        n_doc1, n_doc2 = crossV.shape
        
        for i_doc in range(n_doc1):
            i_offset = i_doc + job_bd[0] + r_offset
            L = top_nbrs[i_offset]
            for j in range(n_doc2):            
                if i_offset == j+c_offset:
                    continue

                if len(L)<n_nbr:
                    heapq.heappush(L, (crossV[i_doc,j],j+c_offset))
                elif crossV[i_doc,j] > L[0][0]:
                    heapq.heapreplace(L, (crossV[i_doc,j],j+c_offset))
        
            top_nbrs[i_offset] = L

        if verbose:
            print('process range (%d,%d)'%(job_bd[0],job_bd[1]))
Developer: changhw01, Project: TextMining, Lines of code: 27, Source file: doc_processer.py


Example 9: get_related_news

def get_related_news(articles ,base_art_index):
    
    if related_dict.get(base_art_index) is not None :
        return related_dict.get(base_art_index)
    
    corpus = []
    for art in articles :
        corpus.append( ' '.join( jieba.cut(art.context) ) )
    ls = [w for w in  WordCutLibs.stopwords.split('\n')]
    vectorizer = CountVectorizer(stop_words=ls)
    X = vectorizer.fit_transform(corpus)
    #word = vectorizer.get_feature_names()
    #stopword = vectorizer.get_stop_words()
    
    
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    #weight = tfidf.toarray()
    
    target = base_art_index  # set the target title  # index order follows the SQL
    
    
    cosine_similarities = linear_kernel(tfidf[target], tfidf).flatten().argsort()
    
    max_len = len(cosine_similarities)
    bnd = -11 if max_len >= 10 else -(max_len)
    related_docs_indices = cosine_similarities[: bnd:-1]
    
    res = [ articles[idx] for idx in related_docs_indices ]
    related_dict[base_art_index] = res
    return res
    
Developer: HRTseng, Project: III, Lines of code: 31, Source file: HotIssueGroup.py


Example 10: print_most_cos_sim

    def print_most_cos_sim(self, thresh=0.675):
        '''
        Prints the two posts that have the highest cosine similarity
        '''
        cos_sims = linear_kernel(self.word_vecs, self.word_vecs)

        # Initialize max_sim = 0, only consider cos sims under threshold
        # so we know we're not recording a post compared with itself (1.0)
        max_cos_sim = 0.0
        thr = thresh

        # Find max_cos_sim
        for i, j in enumerate(cos_sims):
            for k, l in enumerate(j):
                if (float(l) >= max_cos_sim) and (float(l) < thr):
                    max_cos_sim = float(l)

        # Find indices of max_cos_sim
        double_break = False
        for i, j in enumerate(cos_sims):
            for k, l in enumerate(j):
                if float(l) == max_cos_sim:
                    ind1, ind2 = i, k
                    double_break = True
                    break
            if double_break:
                break

        print 'Posts with highest cosine similarity ({:.3f}):\n\nPost {}:\n{}\
            \n\nPost {}:\n{}'.format(max_cos_sim, ind1, self.posts[ind1],
                                    ind2, self.posts[ind2])
Developer: stong1108, Project: CL_missedconn, Lines of code: 31, Source file: kMeansPosts.py


Example 11: __init__

    def __init__(self, *args, **kwargs):
        super(QUIRE, self).__init__(*args, **kwargs)
        self.Uindex = [idx for idx, _ in self.dataset.get_unlabeled_entries()]
        self.Lindex = [idx for idx in range(len(self.dataset)) if idx not in self.Uindex]
        self.lmbda = kwargs.pop("lambda", 1.0)
        X, self.y = zip(*self.dataset.get_entries())
        self.y = list(self.y)
        self.kernel = kwargs.pop("kernel", "rbf")
        if self.kernel == "rbf":
            self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop("gamma", 1.0))
        elif self.kernel == "poly":
            self.K = polynomial_kernel(
                X=X, Y=X, coef0=kwargs.pop("coef0", 1), degree=kwargs.pop("degree", 3), gamma=kwargs.pop("gamma", 1.0)
            )
        elif self.kernel == "linear":
            self.K = linear_kernel(X=X, Y=X)
        elif hasattr(self.kernel, "__call__"):
            self.K = self.kernel(X=np.array(X), Y=np.array(X))
        else:
            raise NotImplementedError

        if not isinstance(self.K, np.ndarray):
            raise TypeError("K should be an ndarray")
        if self.K.shape != (len(X), len(X)):
            raise ValueError("kernel should have size (%d, %d)" % (len(X), len(X)))
        self.L = np.linalg.inv(self.K + self.lmbda * np.eye(len(X)))
Developer: ckbjimmy, Project: libact, Lines of code: 26, Source file: quire.py


Example 12: get

    def get(self):
        query = self.get_argument('q', None)
        if query is None:
            return
        queryTerms = query.split()
        # let's say we have N documents and M terms in query
        # Apparently we assume unique term in query
        # queryVector is a 1 * M dimension array
        queryVector = np.array([self._logIDF[term] for term in queryTerms])
        # docVectorDict is an N * M structure, with default value np.array([0] * M)
        docVectorDict = defaultdict(lambda: np.array([0]*len(queryTerms)))

        for i in range(len(queryTerms)):
            term = queryTerms[i].lower()
            newList = self._postingsList[term]
            for item in newList:  # newList is [(docID,tf)]
                docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
        docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
        docIx = 0
        docIxToDocID = {}
        for docID in docVectorDict.keys():
            docMatrix[docIx][:] = docVectorDict[docID][:]
            docIxToDocID[docIx] = docID
            docIx += 1
        # linear_kernel is used to compute the similarity
        sims = linear_kernel(queryVector,docMatrix).flatten()
        # argsort return the index 
        bestDocIxes = sims.argsort()[::-1]
        bestDocSims = sims[bestDocIxes]
        bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
        postings = zip(bestDocIDs, bestDocSims)
        self.write(json.dumps({"postings":postings}))
Developer: YuanyiYang, Project: NYUCourse, Lines of code: 32, Source file: index.py


Example 13: predict

def predict(data, vect, user_list, tweet_list, word_counts): 
	vector = vect.transform(data)
	result_matrix = linear_kernel(vector, word_counts)
	
	indices_of_tweets = []

	# For each tweet by the client, find the 30 most similar tweets
	# This list may include tweets by the client
	for row in result_matrix: 
		indices = row.argsort()[:][::-1]
		indices_of_tweets.append(indices[2:51])


	# Return the person that tweeted each of the 50 most similar tweets
	user_array = np.array(user_list)
	persons_per_tweet = []

	for row in indices_of_tweets: 
		persons_per_tweet.append(user_array[row])

	# Count up how many times each person shows up. 
	# Same weighting is given to people who have many tweets similar to one client tweet
	# and a tweet that matches a high number of client tweets.
	persons_counter = Counter()

	for row in persons_per_tweet: 
		persons_counter.update(row)

	# return the top 25 people in this list
	top_people_and_count = persons_counter.most_common(25)

	top_people = [tup[0] for tup in top_people_and_count]

	return top_people
Developer: susiexsun, Project: capstone_project, Lines of code: 34, Source file: tweetdoc_predict2.py


Example 14: getRelevantPassages

def getRelevantPassages(query, k):
    queryVector = allTextVectorizer.transform([query])
    queryIndices = numpy.array([allTextVectorizer.vocabulary_.get(word) for word in allTextAnalyzer(query)])
    queryIndices = [i for i in queryIndices if i is not None]
    querySimilarityScores = linear_kernel(queryVector[:,queryIndices], allTextIndex[:,queryIndices]).flatten()
    relatedDocIndices = querySimilarityScores.argsort()[:-k:-1]
    return [allTextLines[i] for i in relatedDocIndices]
Developer: Jnanayogi33, Project: ScienceQA, Lines of code: 7, Source file: QAUtils.py


Example 15: get_results

def get_results(query):

    test = query
    response = tfidf.transform([test])

    print 'response: ', response

    RESULTS_ARRAY = []

    cosine_similarities = linear_kernel(response, tfs).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    for i in related_docs_indices:
        if cosine_similarities[i] > 0:
            file_name = token_dict.keys()[i].split('.')[0] + '.pdf.html.json'
            data = {}
            data = summary_dict[file_name]
            data.update({"candidate": token_dict.keys()[i].split('.')[0],
                            "cosine": cosine_similarities[i]})
            # data = {"candidate": token_dict.keys()[i].split('.')[0],
            #                 "cosine": cosine_similarities[i]}

            RESULTS_ARRAY.append(data)
            # print "%-50s %.4f" % (token_dict.keys()[i].split('.')[0],cosine_similarities[i])

    # print RESULTS_ARRAY
    return RESULTS_ARRAY
Developer: gvishal, Project: Sematic-Job-Recommendation-Engine, Lines of code: 26, Source file: generic_search.py


Example 16: main

def main(protein_dict,q1,q2):
    if not protein_dict:
        angle_list = ['9999']
    elif len(protein_dict) < 2:
        angle_list = ['9999']
    else:
        train_set = [' '.join(protein_dict[x]) for x in protein_dict]
        proteins = [x for x in protein_dict]        
        tfidf_vectorizer = TfidfVectorizer()
        tfidf = tfidf_vectorizer.fit_transform(train_set)  #finds the tfidf score with normalization
#        print 'tfidf[0:1]', tfidf[0:1]
#        print 'tfidf[0:2]', tfidf[0:2]
#        print 'tfidf', tfidf
        cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
#        print 'cosine_similarities', cosine_similarities
        related_docs_indices = cosine_similarities.argsort()[:-5:-1]

        degrees_list = []
        for a in (cosine_similarities[related_docs_indices].tolist()):

            try:
                angle_list = []
                angle_in_radians = math.acos(a)
                angle_in_degrees = math.degrees(angle_in_radians)
                degrees_list.append(angle_in_degrees) 
                angle_list.append(angle_in_degrees)   
            except ValueError:
                angle_list = ['9999']
                
        if len(degrees_list) > 1:
            return_list = [degrees_list[1]]
        else:
            return_list = degrees_list

        return return_list   
Developer: rothadamg, Project: UPSITE, Lines of code: 35, Source file: Cosine_Sim.py


Example 17: _apply_kernel

    def _apply_kernel(self, x, y):
        """Apply the selected kernel function to the data."""
        if self.kernel == 'linear':
            phi = linear_kernel(x, y)
        elif self.kernel == 'rbf':
            phi = rbf_kernel(x, y, self.coef1)
        elif self.kernel == 'poly':
            phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
        elif callable(self.kernel):
            phi = self.kernel(x, y)
            if len(phi.shape) != 2:
                raise ValueError(
                    "Custom kernel function did not return 2D matrix"
                )
            if phi.shape[0] != x.shape[0]:
                raise ValueError(
                    "Custom kernel function did not return matrix with rows"
                    " equal to number of data points."""
                )
        else:
            raise ValueError("Kernel selection is invalid.")

        if self.bias_used:
            phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)

        return phi
Developer: SuixueWang, Project: scikit-rvm, Lines of code: 26, Source file: rvm.py


Example 18: sim_char5

def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Developer: softlang, Project: wikionto, Lines of code: 7, Source file: seed_sim.py


Example 19: get

	def get(self):		
		query = self.get_argument('q', None)
		if query is None:
			return
		queryTerms = query.split()
		queryVector = np.array([self._logIDF[term] for term in queryTerms])
		docVectorDict = defaultdict(lambda: np.array([0] * len(queryTerms)))
		for i in range(len(queryTerms)):
			term = queryTerms[i].lower()
			newList = self._postingsLists[term]
			for item in newList:
				docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
		docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
		docIx = 0
		docIxToDocID = {}
		for docID in docVectorDict.keys():
			docMatrix[docIx][:] = docVectorDict[docID][:]
			docIxToDocID[docIx] = docID
			docIx += 1
		sims = linear_kernel(queryVector, docMatrix).flatten()
		bestDocIxes = sims.argsort()[::-1]
		bestDocSims = sims[bestDocIxes]
		bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
		postings = zip(bestDocIDs, bestDocSims)
		self.write(json.dumps({"postings": postings}))
Developer: YuanyiYang, Project: NYUCourse, Lines of code: 25, Source file: index.py


Example 20: _train

    def _train(self, ds):
        """
        Train the engine.

        Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product. The 'stop_words' param
        tells the TF-IDF module to ignore common english words like 'the', etc.

        Then we compute similarity between all products using SciKit Learn's linear_kernel (which in this case is
        equivalent to cosine similarity).

        Iterate through each item's similar items and store the 100 most-similar. Stops at 100 because well...
        how many similar products do you really need to show?

        Similarities and their scores are stored in redis as a Sorted Set, with one set for each item.

        :param ds: A pandas dataset containing two fields: description & id
        :return: Nothin!
        """
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(ds['content'])

        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

        for idx, row in ds.iterrows():
            similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
            similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

            # First item is the item itself, so remove it.
            # This 'sum' is turns a list of tuples into a single tuple: [(1,2), (3,4)] -> (1,2,3,4)
            flattened = sum(similar_items[1:], ())
            self._r.zadd(self.SIMKEY % row['id'], *flattened)
Developer: qqqwey941008, Project: wechat_web_scraper, Lines of code: 31, Source file: engines.py



Note: The sklearn.metrics.pairwise.linear_kernel examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs, and other platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the code remains with the original authors. Please consult each project's license before redistributing or reusing the code; do not repost without permission.

