This article collects and summarizes typical usage examples of the Python function sklearn.datasets.get_data_home. If you have been wondering what get_data_home does, how to use it, or what real code that calls it looks like, the hand-picked examples below may help.
The following presents 20 code examples of the get_data_home function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
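Before the collected examples, here is a minimal usage sketch (not taken from any of the projects below) showing the basic contract of get_data_home and its companion clear_data_home; the custom cache path is a hypothetical location chosen purely for illustration:

import os
from sklearn.datasets import get_data_home, clear_data_home

# get_data_home returns the scikit-learn data cache directory, creating it if
# it does not exist. By default this is ~/scikit_learn_data, but it can be
# overridden via the SCIKIT_LEARN_DATA environment variable or the data_home argument.
data_home = get_data_home()
print("data home: %s (exists: %s)" % (data_home, os.path.exists(data_home)))

# A custom location can be passed explicitly (hypothetical path for illustration).
custom_home = get_data_home(data_home='/tmp/my_sklearn_cache')

# clear_data_home deletes the cache directory and everything downloaded into it.
clear_data_home(data_home=custom_home)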
Example 1: test_data_home
def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))
    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))
    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
Author: Calvin-O, Project: scikit-learn, Lines of code: 13, Source: test_base.py
Example 2: setup_module
def setup_module():
    check_skip_network()
    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Author: 0664j35t3r, Project: scikit-learn, Lines of code: 7, Source: rcv1_fixture.py
Example 3: get_unclassified_data
def get_unclassified_data(self):
    source_path = os.path.join(get_data_home(), 'tweets_unclassified\\' + self.disease)
    file_paths = []
    for root, directories, files in os.walk(source_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            file_paths.append(file_path)
    print 'unclassified data loaded from ' + str(file_paths)
    tweets = []
    for file_path in file_paths:
        line_num = 0
        with codecs.open(file_path, 'r') as f:
            for line in f:
                if line_num > 0:
                    try:
                        tweets.append(Tweet(line))
                        line_num += 1
                    except:
                        # str() is needed here: concatenating the int line_num directly would raise a TypeError
                        print "Unexpected error in line " + str(line_num) + ":", pickle.sys.exc_info()[0]
                else:
                    # skip the header line
                    line_num += 1
        f.closed  # no-op; the with-statement already closes the file
    print 'unclassified tweets loaded ' + str(len(tweets))
    return tweets
Author: yuravariat, Project: TweetsClassifier, Lines of code: 25, Source: data.py
Example 4: setup_working_with_text_data
def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Author: manhhomienbienthuy, Project: scikit-learn, Lines of code: 7, Source: conftest.py
Example 5: fetch_vega_spectrum
def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])
    if not os.path.exists(refspec_file):
        print "downloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())
    F = open(refspec_file)
    data = np.loadtxt(F)
    return data
Author: kickbean, Project: TextMiningWithSklearn, Lines of code: 12, Source: plot_sdss_filters.py
Example 6: fetch_filter
def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)
    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())
    F = open(loc)
    data = np.loadtxt(F)
    return data
Author: kickbean, Project: TextMiningWithSklearn, Lines of code: 14, Source: plot_sdss_filters.py
Example 7: fetch_sdss_spec_data
def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)
    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # data directory is password protected so the public can't access it
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
Author: kickbean, Project: TextMiningWithSklearn, Lines of code: 17, Source: plot_sdss_specPCA.py
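Examples 5 through 7 are written for Python 2 (print statements, urllib2). As a hedged sketch only, the same download-and-cache pattern could be expressed in Python 3 with urllib.request; the REFSPEC_URL below is a placeholder, not the real data URL from those projects:

import os
import urllib.request
import numpy as np
from sklearn.datasets import get_data_home

REFSPEC_URL = 'https://example.org/data/refspec.dat'  # placeholder URL for illustration

def fetch_vega_spectrum_py3(data_home=None):
    # Cache the downloaded file under the scikit-learn data home, then load it with numpy.
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])
    if not os.path.exists(refspec_file):
        print("downloading from %s" % REFSPEC_URL)
        with urllib.request.urlopen(REFSPEC_URL) as response, open(refspec_file, 'wb') as out:
            out.write(response.read())
    return np.loadtxt(refspec_file)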
Example 8: stream_reuters_documents
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            # print(doc)
            yield doc
Author: mbonaventura, Project: aa2015, Lines of code: 44, Source: my_plot_out_of_core_classification.py
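A brief usage sketch for the generator above (assuming ReutersParser and the other helpers from the same script are available); it simply peeks at the first few parsed documents:

# Hypothetical usage: inspect the first three Reuters documents from the stream.
for i, doc in enumerate(stream_reuters_documents()):
    print(doc['title'], '-', doc['topics'])
    if i >= 2:
        break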
Example 9: create_data
def create_data(self):
    data_home = get_data_home()
    cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)
    if os.path.exists(cache_path):
        return
    # e.g. C:\Users\[user]\scikit_learn_data\hiv
    # disease_path = os.path.join(data_home, self.disease)
    # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
    tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
    if not os.path.exists(tweets_path):
        return
    '''
    *** Manual process:
    Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt (all annotation files should keep the same format).
    *** Automated process:
    1. Get file names from C:\Users\[user]\scikit_learn_data\tweets\hiv
    2. For each file, read all tweets line by line (only those where the category is not empty)
    3. For each tweet, generate a unique file
    '''
    train_path = os.path.join(tweets_path, self.train_folder)
    train_output_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
    if not os.path.exists(train_output_path):
        os.makedirs(train_output_path)
    test_path = os.path.join(tweets_path, self.test_folder)
    test_output_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)
    if not os.path.exists(test_output_path):
        os.makedirs(test_output_path)
    train_tweets = self._load_tweets(train_path)
    self._generate_singular_tweet_files(train_tweets, train_output_path)
    test_tweets = self._load_tweets(test_path)
    self._generate_singular_tweet_files(test_tweets, test_output_path)
Author: yuravariat, Project: TweetsClassifier, Lines of code: 37, Source: data.py
Example 10: _fetch_drug_protein
def _fetch_drug_protein(data_home=None):
    """Fetch the drug-protein dataset from the server"""
    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this dataset has already been downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)
        if not os.path.exists(filename):
            urlname = base_url + base_name
            print("Download data at {}".format(urlname))
            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise
            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                os.remove(filename)
                raise
            url.close()
    return data_home
Author: arjoly, Project: random-output-trees, Lines of code: 37, Source: datasets.py
Example 11: setup_module
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Author: AlexandreAbraham, Project: scikit-learn, Lines of code: 4, Source: twenty_newsgroups_fixture.py
Example 12: Memory
import os
import numpy as np
from sklearn.datasets import get_data_home
from sklearn.datasets import fetch_mldata
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')

@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Author: 1TTT9, Project: scikit-learn, Lines of code: 31, Source: bench_mnist.py
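Note that fetch_mldata, used in load_data above, was deprecated and later removed from scikit-learn after mldata.org went offline. As a hedged sketch for recent scikit-learn versions, the equivalent MNIST load via fetch_openml could look like this:

import numpy as np
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML; as_frame=False returns numpy arrays,
# similar to what the old fetch_mldata('MNIST original') produced.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X = X.astype(np.float32) / 255  # normalize pixel values to [0, 1]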
Example 13: fetch_olivetti_faces
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home

if __name__ == "__main__":
    fetch_olivetti_faces()
    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
Author: JeanKossaifi, Project: workshop_python, Lines of code: 12, Source: fetch_data.py
Example 14: zip
plt.legend(loc='upper right')
plt.show()
'''
x_index = 0
y_index = 3
'''
for label, color in zip(range(len(d1.target_names)), colors):
    plt.scatter(d1.data[d1.target == label, x_index],
                d1.data[d1.target == label, y_index],
                label=d1.target_names[label], color=color)  # scatter plot
plt.xlabel(d1.feature_names[x_index])
plt.xlabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()
'''
'''
fig = plt.figure(figsize=(6, 6))
fig.subplotpars(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(d3.images[i], cmap=plt.cm.binary, interpolation="nearest")
    ax.text(0, 7, str(d3.target[i]))
plt.show()
'''
# china = datasets.load_sample_image('china.jpg')
print(datasets.get_data_home())
Author: xxwei, Project: TraderCode, Lines of code: 30, Source: sklearnData.py
Example 15: list
chunk_size = 1000
data_chunks = list(partition(chunk_size, testData))
print('start prediction')
for i, chunk in enumerate(data_chunks):
    t0 = time()
    predicted = clf.classifier.predict(list(chunk))
    ranTime = time() - t0
    print('progress ' + str(round((i + 1) / float(len(data_chunks)) * 100, 2)) + '% last_predict_time=' + str(ranTime))
    for j in range(len(chunk)):
        testData[i * chunk_size + j].talk_about = str(clf.labels[predicted[j]])
print('predict done')

file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)
if not os.path.exists(file_dir):
    os.makedirs(file_dir)
file_path = os.path.join(file_dir, 'output.txt')
with codecs.open(file_path, "w", "utf-8") as text_file:
    for i in range(len(testData)):
        try:
            tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                format(testData[i].tweet_id,
                       testData[i].query,
                       testData[i].disease,
                       testData[i].created_at,
                       testData[i].screen_name,
                       # ... (snippet truncated in the original listing)
Author: yuravariat, Project: TweetsClassifier, Lines of code: 31, Source: Classifier_aboutself_vs_aboutothers.py
Example 16: setup_labeled_faces
def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
Author: BasilBeirouti, Project: scikit-learn, Lines of code: 4, Source: conftest.py
Example 17: setup_twenty_newsgroups
def setup_twenty_newsgroups():
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Author: BasilBeirouti, Project: scikit-learn, Lines of code: 4, Source: conftest.py
Example 18: setup_twenty_newsgroups
def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Author: manhhomienbienthuy, Project: scikit-learn, Lines of code: 5, Source: conftest.py
Example 19: get_data
def get_data(self, subset='train', categories=None, shuffle=True, random_state=42):
    data_home = get_data_home()
    cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)
    train_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
    test_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)

    cache = None
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'rb') as f:
                compressed_content = f.read()
            uncompressed_content = codecs.decode(
                compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if cache is None:
        cache = self.get_cache(train_path=train_path, test_path=test_path, cache_path=cache_path)

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)
        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'The HIV dataset'

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = validation.check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data
Author: yuravariat, Project: TweetsClassifier, Lines of code: 71, Source: data.py
Example 20: setup_working_with_text_data
def setup_working_with_text_data():
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Author: AlexisMignon, Project: scikit-learn, Lines of code: 5, Source: conftest.py
Note: The sklearn.datasets.get_data_home examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their authors; copyright remains with the original authors, and distribution or use of the code should follow the license of the corresponding project. Do not reproduce this compilation without permission.