This article collects typical usage examples of the sklearn.preprocessing.Normalizer class in Python. If you have been wondering how exactly the Python Normalizer class is used, what it is good for, or where to find examples of it, the curated class code examples below may help.
Below are 20 code examples of the Normalizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help our system recommend better Python code examples.
Example 1: normalize_test
def normalize_test():
    import numpy as np
    from sklearn.preprocessing import Normalizer
    # Normalizer expects a 2D array of shape (n_samples, n_features)
    X = np.array([[1, 2, 3, 4, 5, 2, 6, 8]], dtype=float)
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)
    print(X2)
Developer: swenker, Project: bigdata, Lines: 7, Source: scikit_lab.py
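Not part of the original snippet, but worth noting: Normalizer scales each sample (row) independently to unit norm. A minimal illustrative sketch comparing the L2 (default) and L1 norms:

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[1, 2, 3, 4], [5, 2, 6, 8]], dtype=float)

# Each row is scaled independently to unit norm.
l2 = Normalizer(norm='l2').fit_transform(X)
l1 = Normalizer(norm='l1').fit_transform(X)

print(np.linalg.norm(l2, axis=1))  # -> [1. 1.]
print(np.abs(l1).sum(axis=1))      # -> [1. 1.]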
Example 2: kfold
def kfold(agetext, k, model, nfeatures, check=False, k2=None, max_df=0.9, min_df=3):
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        X = agetext["text"].tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer=tokenize,
                              token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',
                              lowercase=False, max_features=nfeatures,
                              max_df=max_df, min_df=min_df,
                              use_idf=True, ngram_range=(1, 2))
        docs = [" ".join(doc) for doc in X]
        docs2 = [doc.replace("\t", "").replace("\n", "") for doc in docs]
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]
        if check:
            # LSA: reduce with SVD, then scale each sample to unit norm
            lsa = TruncatedSVD(k2, algorithm='arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X, tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred), 2))
    print(out)
    print(np.mean(out))
Developer: hurelyyu, Project: CS_Master_UW, Lines: 31, Source: TMClassCopy.py
Example 3: TfIdf
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()
        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)
        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)
        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()
        trunc = self.truncated.transform(tfidf_matrix)
        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
Developer: EdwardBetts, Project: Yulp, Lines: 30, Source: tfidf.py
Example 4: kfold
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        print(collections.Counter(y_train))
        testdata = X_test.values
        # LSA: dimensionality reduction followed by per-sample normalization
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X, y_train)
        pred = model.predict(X_test)
        print(collections.Counter(y_test))
        print(collections.Counter(pred))
        out.append(round(accuracy_score(y_test, pred), 5))
    print(out)
    print(np.mean(out))
Developer: hurelyyu, Project: CS_Master_UW, Lines: 29, Source: AgeGroup.py
Example 5: preprocess
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True, the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
            a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
            preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.as_matrix()
    else:
        d = data.df.as_matrix()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()
    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100
    return e, norm.fit_transform(X)
Developer: marcomorucci, Project: Clustering-Constitutions, Lines: 29, Source: analyze.py
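The TruncatedSVD-plus-Normalizer combination used here is the standard LSA recipe, and it can be packaged as a single estimator with make_pipeline. A sketch under the same assumptions as the function above (d is the document-term matrix):

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# LSA in one step: SVD reduction followed by per-sample unit-norm scaling.
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
# X = lsa.fit_transform(d)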
Example 6: __init__
def __init__(self,
             YTrain_file,
             XTrain_file,
             XTest_file,
             output_path,
             normalise,
             C,
             class_weight,
             ):
    """
    Arguments:
    """
    self.YTrain = joblib.load(YTrain_file)
    XTrain = joblib.load(XTrain_file)
    self.XTrain = XTrain.reshape(np.size(XTrain, axis=0), -1)
    XTest = joblib.load(XTest_file)
    self.XTest = XTest.reshape(np.size(XTest, axis=0), -1)
    self.output_path = output_path

    if normalise:
        # copy=False lets transform() normalize the arrays in place
        normalizer = Normalizer(copy=False)
        normalizer.transform(self.XTrain)
        normalizer.transform(self.XTest)

    self.C = C
    if class_weight == 'none':
        class_weight = None
    self.class_weight = class_weight
Developer: dchall88, Project: DIGITS, Lines: 31, Source: svm_train_test.py
Example 7: getPcaFeatures
def getPcaFeatures(self, images, components, image_size):
    imageDataset = self.getImagesAsDataset(images, image_size)
    norm = Normalizer()
    imageDataset = norm.fit_transform(imageDataset)
    pca = PCA(n_components=components)
    imageDataset = pca.fit_transform(imageDataset)
    return pca, norm, imageDataset
Developer: tincho4t, Project: aaTP, Lines: 7, Source: ImagesProcessor.py
Example 8: explore_k
def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore
    Returns:
        scores: list of inertia scores for each k value
    '''
    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    verbose=2)
        km.fit(norm_data)
        scores.append(-1 * km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
Developer: lwoloszy, Project: albumpitch, Lines: 25, Source: genres.py
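The "spherical kmeans, so normalize" trick rests on a small identity: for unit vectors, squared Euclidean distance is a monotone function of cosine similarity, so ordinary KMeans on L2-normalized rows effectively clusters by cosine distance. A quick illustrative check:

import numpy as np

a = np.array([3.0, 4.0])
b = np.array([1.0, 7.0])
ua = a / np.linalg.norm(a)
ub = b / np.linalg.norm(b)

# For unit vectors u, v: ||u - v||^2 == 2 - 2 * cos(u, v)
assert np.isclose(np.sum((ua - ub) ** 2), 2 - 2 * (ua @ ub))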
Example 9: _normalize
def _normalize(self, X, y, X_t):
    from sklearn.preprocessing import Normalizer
    NORM = Normalizer()
    X = NORM.fit_transform(X, y)
    X_t = NORM.transform(X_t)
    return X, X_t
Developer: mikbuch, Project: pymri, Lines: 8, Source: datasets.py
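One detail of this helper: Normalizer is stateless, so fit learns nothing from the training data and the y argument is ignored; transform alone would give the same result. An illustrative check:

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, 4.0], [6.0, 8.0]])
# fit() is a no-op for Normalizer, so both calls agree.
assert np.allclose(Normalizer().transform(X), Normalizer().fit_transform(X))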
Example 10: kmeans
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    '''
    Performs k-means clustering on svd transformed data and plots it

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means
        n_words: number of top words to display per cluster
    Returns:
        km: the fitted KMeans object
    '''
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5,
                verbose=2)
    km.fit(norm_data)

    # Map cluster centroids back to the original term space to find top words
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()
    terms = prettify(terms)
    terms = np.array(terms)

    fig = plt.figure(figsize=(10, 8))
    for i in range(10):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print('  {:s}'.format(terms[ind]))
        print('\n')

        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i + 1)
        ax.set_title('Cluster {:d}'.format(i + 1))
        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)
        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # sorted_component = np.sort(component)
        colors = sns.color_palette('Purples', 9).as_hex()
        colors = np.repeat(colors[-1], n_words)
        cb.set_ticks(np.linspace(mn, mx, n_words + 2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])
    plt.tight_layout()
    return km
Developer: lwoloszy, Project: albumpitch, Lines: 58, Source: genres.py
Example 11: readAndPreProcess
def readAndPreProcess():
    print("\n\n********** CS-412 HW5 Mini Project **********")
    print("************ Submitted by Sankul ************\n\n")
    print("Reading data, please ensure that the dataset is in same folder.")
    resp = pd.read_csv('responses.csv')
    print("Data reading complete!")
    print("Some stats regarding data:")
    resp.describe()

    print("\nStarting pre-processing.....")
    print("\nFinding missing values:")
    print("Missing values found, removing them")
    emptyVals = resp.isnull().sum().sort_values(ascending=False)
    emptyPlot = emptyVals.plot(kind='barh', figsize=(20, 35))
    plt.show()
    print("Empty values removed")

    print("\nChecking for NaN and infinite values in target column (Empathy):")
    if len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]):
        print("Number of infinite or NaN values in Empathy column: ",
              len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]))
        print("Removing them")
        resp = resp[np.isfinite(resp['Empathy'])]
        print("Infinite and NaN values removed")

    print("\nChecking for categorical features:")
    if pd.Categorical(resp).dtype.name == 'category':
        print("Categorical features found. Removing them...")
        resp = resp.select_dtypes(exclude=[object])
        print("Categorical features removed")

    print("\nReplacing NaN values with the mean value:")
    resp = resp.fillna(resp.mean())
    resp.isnull().sum()
    print("Values replaced")

    print("\nSeparating labels from data:")
    Y = resp['Empathy'].values
    X = resp.drop('Empathy', axis=1)
    print("Labels separated")

    print("\nScaling, standardizing and normalizing the data:")
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(X)
    scaler = StandardScaler().fit(rescaledX)
    standardizedX = scaler.transform(rescaledX)
    normalizer = Normalizer().fit(standardizedX)
    normalizedX = normalizer.transform(standardizedX)
    print("Scaling, standardizing and normalizing completed")

    print("\nFinal data looks like:")
    print(normalizedX.shape)
    print("Values inside look like:")
    print(normalizedX[0])

    return normalizedX, Y
Developer: dark-shade, Project: CS-412-IML-HW5-Mini-Project, Lines: 58, Source: hw5.py
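The scale-standardize-normalize chain at the end can also be written as a single Pipeline, which keeps the three fitted steps together for reuse on new data. A sketch assuming X is the numeric feature matrix from the function above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

# The same three preprocessing steps, applied in order by one estimator.
prep = make_pipeline(MinMaxScaler(feature_range=(0, 1)),
                     StandardScaler(),
                     Normalizer())
# normalizedX = prep.fit_transform(X)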
Example 12: __init__
def __init__(self, img_dir):
    self._imgdir = img_dir
    self._extractors = self.__get_extractors()
    self._normalizer = Normalizer()
    self._face_normalizer = Normalizer()
    self._estimator = NearestNeighbors(n_neighbors=3)
    self._face_estimator = NearestNeighbors(n_neighbors=3)
    self._imgnames = []
    self._face_imgnames = []
Developer: xulesc, Project: general, Lines: 9, Source: wally.py
Example 13: ScikitNormalizer
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)

    def transform(self, data):
        return (self.data_normalizer.transform(data) + 1) / 2
Developer: Falgunithakor, Project: SummerResearchDE-BPSO, Lines: 9, Source: Normalizer.py
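Since L2-normalized rows have every component in [-1, 1], the (x + 1) / 2 shift in transform maps the output into [0, 1]. A hypothetical usage sketch of the wrapper above:

import numpy as np

sn = ScikitNormalizer()
data = np.array([[3.0, -4.0], [1.0, 2.0]])
sn.fit(data)
out = sn.transform(data)
# L2-normalized components lie in [-1, 1], so the shift lands in [0, 1].
assert out.min() >= 0.0 and out.max() <= 1.0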
Example 14: test_ver2_syntetic_dataset
def test_ver2_syntetic_dataset(self):
    self.ex = experiment.Experiment()
    self.ex.cf_matrix = load_sparse_data('syntetic_cf.dat')
    n = Normalizer(norm='l2', copy=True)
    self.ex.cf_matrix = n.transform(self.ex.cf_matrix)  # normalized.
    self.ex.cb_prox = experiment.Experiment.load_data(PKL + 'cb_prox.pkl')
    self.ex.cf_prox = self.ex.cf_matrix * self.ex.cf_matrix.T
    self.ex.test_corr_sparsity(draw=True, interval=100)
Developer: osmanbaskaya, Project: acm_mak, Lines: 9, Source: unittest_experiment.py
Example 15: reduce_dimension
def reduce_dimension(self, n_components=2):
    """ Return PCA transform of self.data, with n_components. """
    reducer = PCA(n_components=n_components)
    X = self.data.values
    norm = Normalizer()
    Xnorm = norm.fit_transform(X)
    return reducer.fit_transform(Xnorm)
Developer: abshinn, Project: practice, Lines: 11, Source: pima.py
Example 16: normalize
def normalize(self, msi, norm="l1"):
    original_shape = msi.get_image().shape
    collapsed_image = collapse_image(msi.get_image())
    # temporarily save the mask, since the sklearn normalizer strips masks
    is_masked_array = isinstance(msi.get_image(), np.ma.MaskedArray)
    if is_masked_array:
        mask = msi.get_image().mask
    normalizer = Normalizer(norm=norm)
    normalized_image = normalizer.transform(collapsed_image)
    if is_masked_array:
        normalized_image = np.ma.MaskedArray(normalized_image, mask=mask)
    msi.set_image(np.reshape(normalized_image, original_shape))
Developer: 151706061, Project: MITK, Lines: 12, Source: normalize.py
Example 17: test_pipeline
def test_pipeline():
    norm = Normalizer(norm='l1')
    norm_id = norm.what().id()
    assert norm_id == "Normalizer(norm='l1')"
    kmeans = KMeans(n_clusters=12)
    kmeans_id = kmeans.what().id()
    print(kmeans_id)
    assert kmeans_id == \
        "KMeans(algorithm='auto',init='k-means++',max_iter=300,n_clusters=12,n_init=10,random_state=None,tol=0.0001)"
    # noinspection PyTypeChecker
    pipeline_id = Pipeline((('norm', norm), ('kmeans', kmeans))).what().id()
    assert pipeline_id == "Pipeline(steps=(('norm',%s),('kmeans',%s)))" % (norm_id, kmeans_id)
Developer: sdvillal, Project: whatami, Lines: 12, Source: test_what_sklearn.py
Example 18: make_nn_regression
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                          test_size=test_size, train_size=1 - test_size)
        train, test = list(cv)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
Developer: RPGOne, Project: sebabulba, Lines: 53, Source: samples_generator.py
Example 19: KNN
class KNN(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        # Fit on log-transformed targets; guess() exponentiates predictions back
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ",
              self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
Developer: codeaudit, Project: entity-embedding-rossmann, Lines: 12, Source: models.py
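The manual log/exp target handling in this class can also be delegated to scikit-learn's TransformedTargetRegressor (available since 0.20). A hedged sketch, not the original author's code:

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.neighbors import KNeighborsRegressor

# Applies np.log to y at fit time and np.exp to predictions automatically.
reg = TransformedTargetRegressor(
    regressor=KNeighborsRegressor(n_neighbors=10, weights='distance', p=1),
    func=np.log, inverse_func=np.exp)
# reg.fit(X_train_normalized, y_train); reg.predict(X_val_normalized)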
Example 20: get_tf_idf_M
def get_tf_idf_M(M, tf="raw", idf="c", norm_samps=False):
    # tf options (only "raw" implemented): "bin", "raw", "log", "dnorm"
    # idf options (only "c" implemented): "c", "smooth", "max", "prob"
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M)  # just the frequency of the word in a text
    # #TODO: check if dnorm is implemented OK
    # elif tf == "dnorm":
    #     tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1))))
    if idf == "c":
        idf_v = []
        for i in range(M.shape[1]):  # number of texts that contain word words[i]
            idf_v.append(np.count_nonzero(M[:, i]))  # count the non-zero values in columns of M
        idf_v = np.array(idf_v)
        idf_v = np.log(N / idf_v)
        tf_idf_M = tf_M * idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
    # np.savetxt("tf_idf_M_" + str(N) + ".txt", tf_idf_M, fmt="%s")
    return tf_idf_M
Developer: RokIvansek, Project: Spectral-clustering-HW, Lines: 19, Source: newsgroups20.py
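For the tf="raw", idf="c" path, scikit-learn's TfidfTransformer computes nearly the same quantity, with L2 row normalization playing the role of norm_samps=True. One caveat: sklearn's idf is ln(N/df) + 1 with smooth_idf=False, so the results differ from log(N/df) by that added constant. A sketch:

from sklearn.feature_extraction.text import TfidfTransformer

# Near-equivalent of the hand-rolled version above, given raw counts M.
tfidf = TfidfTransformer(norm='l2', smooth_idf=False)
# tf_idf_M = tfidf.fit_transform(M).toarray()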
Note: The sklearn.preprocessing.Normalizer class examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by many developers, and copyright remains with the original authors. For distribution and use, please refer to the corresponding project's License; do not reproduce without permission.