
Python datasets.get_data_home Function Code Examples


This article collects typical usage examples of the Python function sklearn.datasets.get_data_home. If you are unsure what get_data_home does, how to call it, or what real-world usage looks like, the curated code examples below should help.



The following presents 20 code examples of the get_data_home function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
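Before the examples, here is a minimal orientation sketch (not taken from any of the examples below; it simply assumes a standard scikit-learn installation, where the default cache directory is ~/scikit_learn_data unless overridden by the data_home argument or the SCIKIT_LEARN_DATA environment variable):

import os
from sklearn.datasets import get_data_home, clear_data_home

# Resolve (and create, if missing) the local cache directory used by the
# dataset fetchers; by default this is ~/scikit_learn_data.
data_home = get_data_home()
print("scikit-learn data home:", data_home)
print("exists:", os.path.exists(data_home))

# A custom location can be passed explicitly; the folder is created on demand.
# ("/tmp/my_sklearn_cache" is a hypothetical path used only for illustration.)
custom_home = get_data_home(data_home="/tmp/my_sklearn_cache")

# clear_data_home deletes the cache directory and all of its contents.
clear_data_home(data_home=custom_home)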

Example 1: test_data_home

def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder it-self
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
Developer: Calvin-O, Project: scikit-learn, Lines: 13, Source: test_base.py


Example 2: setup_module

def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Developer: 0664j35t3r, Project: scikit-learn, Lines: 7, Source: rcv1_fixture.py


Example 3: get_unclassified_data

    def get_unclassified_data(self):
        source_path = os.path.join(get_data_home(), 'tweets_unclassified\\' + self.disease)
        file_paths = []
        for root, directories, files in os.walk(source_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        print 'unclassified data loaded from ' + str(file_paths)

        tweets = []
        for file_path in file_paths:
            line_num = 0
            with codecs.open(file_path, 'r') as f:
                for line in f:
                    if line_num>0:
                        try:
                            tweets.append(Tweet(line))
                            line_num += 1
                        except:
                            print "Unexpected error in line " + line_num + ":", pickle.sys.exc_info()[0]
                    else:
                        line_num += 1
            f.closed
        print 'unclassified tweets loaded ' + str(len(tweets))
        return tweets
Developer: yuravariat, Project: TweetsClassifier, Lines: 25, Source: data.py


Example 4: setup_working_with_text_data

def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Developer: manhhomienbienthuy, Project: scikit-learn, Lines: 7, Source: conftest.py


Example 5: fetch_vega_spectrum

def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])
    if not os.path.exists(refspec_file):
        print "downnloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())

    F = open(refspec_file)

    data = np.loadtxt(F)
    return data
Developer: kickbean, Project: TextMiningWithSklearn, Lines: 12, Source: plot_sdss_filters.py


Example 6: fetch_filter

def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)
    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())

    F = open(loc)

    data = np.loadtxt(F)
    return data
Developer: kickbean, Project: TextMiningWithSklearn, Lines: 14, Source: plot_sdss_filters.py


Example 7: fetch_sdss_spec_data

def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)

    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # data directory is password protected so the public can't access it    
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
Developer: kickbean, Project: TextMiningWithSklearn, Lines: 17, Source: plot_sdss_specPCA.py


Example 8: stream_reuters_documents

def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            #print (doc)
            yield doc
Developer: mbonaventura, Project: aa2015, Lines: 44, Source: my_plot_out_of_core_classification.py
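For orientation, a short, hypothetical usage sketch of the generator above (it assumes the ReutersParser helper and _not_in_sphinx function from the same source file are available in scope):

from itertools import islice

# Inspect the first three parsed documents; each is a dict with
# 'body', 'title' and 'topics' keys, as described in the docstring above.
for doc in islice(stream_reuters_documents(), 3):
    print(doc['title'], doc['topics'])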


Example 9: create_data

    def create_data(self):
        data_home = get_data_home()
        cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)

        if os.path.exists(cache_path):
            return

        # e.g. C:\Users\[user]\scikit_learn_data\hiv
        # disease_path = os.path.join(data_home, self.disease)
        # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
        tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
        if not os.path.exists(tweets_path):
            return
        '''
        *** Manual process:
        Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt (all annotation files should keep the same format)

        *** Automated process:
        1. Get file names from the C:\Users\[user]\scikit_learn_data\tweets\hiv
        2. For each file read all tweets line by line (only those where the category is not empty)
        3. For each tweet generate a unique file
        '''

        train_path = os.path.join(tweets_path, self.train_folder)
        train_output_path = os.path.join(data_home, self.train_folder,  self.disease + self._cl_cut)
        if not os.path.exists(train_output_path):
            os.makedirs(train_output_path)

        test_path = os.path.join(tweets_path, self.test_folder)
        test_output_path = os.path.join(data_home, self.test_folder,  self.disease + self._cl_cut)
        if not os.path.exists(test_output_path):
            os.makedirs(test_output_path)

        train_tweets = self._load_tweets(train_path)
        self._generate_singular_tweet_files(train_tweets, train_output_path)
        test_tweets = self._load_tweets(test_path)
        self._generate_singular_tweet_files(test_tweets, test_output_path)
Developer: yuravariat, Project: TweetsClassifier, Lines: 37, Source: data.py


Example 10: _fetch_drug_protein

def _fetch_drug_protein(data_home=None):
    """Fetch drug-protein dataset from the server"""

    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)

        if not os.path.exists(filename):
            urlname = base_url + base_name

            print("Download data at {}".format(urlname))

            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise

            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                os.remove(filename)
                raise

            url.close()

    return data_home
Developer: arjoly, Project: random-output-trees, Lines: 37, Source: datasets.py


Example 11: setup_module

def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Developer: AlexandreAbraham, Project: scikit-learn, Lines: 4, Source: twenty_newsgroups_fixture.py


Example 12: Memory

from sklearn.datasets import get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Developer: 1TTT9, Project: scikit-learn, Lines: 31, Source: bench_mnist.py


Example 13: fetch_olivetti_faces

from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home


if __name__ == "__main__":
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
Developer: JeanKossaifi, Project: workshop_python, Lines: 12, Source: fetch_data.py


Example 14: zip

plt.legend(loc='upper right')
plt.show()
'''
x_index = 0
y_index = 3
'''
for label,color in zip(range(len(d1.target_names)),colors):
    plt.scatter(d1.data[d1.target==label,x_index],d1.data[d1.target == label, y_index],label=d1.target_names[label],color=color) # scatter plot

plt.xlabel(d1.feature_names[x_index])
plt.ylabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()

'''

'''
fig = plt.figure(figsize=(6,6))
fig.subplots_adjust(left=0,right=1,bottom=0,top=1,hspace=0.05,wspace=0.05)

for i in range(64):
    ax = fig.add_subplot(8,8,i+1,xticks=[],yticks=[])
    ax.imshow(d3.images[i],cmap=plt.cm.binary,interpolation="nearest")
    ax.text(0,7,str(d3.target[i]))
plt.show()
'''

#china = datasets.load_sample_image('china.jpg')

print(datasets.get_data_home())
Developer: xxwei, Project: TraderCode, Lines: 30, Source: sklearnData.py


Example 15: list

    chunk_size = 1000
    data_chunks = list(partition(chunk_size, testData))

    print ('start prediction')

    for i,chunk in enumerate(data_chunks):
        t0 = time()
        predicted = clf.classifier.predict(list(chunk))
        ranTime = time() - t0
        print ('progress ' + str(round((i+1)/float(len(data_chunks)) * 100,2)) + '% last_predict_time=' + str(ranTime))
        for j in range(len(chunk)):
            testData[i*chunk_size+j].talk_about = str(clf.labels[predicted[j]])

    print ('predict done')

    file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    file_path = os.path.join(file_dir, 'output.txt')

    with codecs.open(file_path, "w", "utf-8") as text_file:
        for i in range(len(testData)):
            try:
                tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                    format(testData[i].tweet_id,
                           testData[i].query,
                           testData[i].disease,
                           testData[i].created_at,
                           testData[i].screen_name,
Developer: yuravariat, Project: TweetsClassifier, Lines: 31, Source: Classifier_aboutself_vs_aboutothers.py


Example 16: setup_labeled_faces

def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
Developer: BasilBeirouti, Project: scikit-learn, Lines: 4, Source: conftest.py


Example 17: setup_twenty_newsgroups

def setup_twenty_newsgroups():
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Developer: BasilBeirouti, Project: scikit-learn, Lines: 4, Source: conftest.py


Example 18: setup_twenty_newsgroups

def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Developer: manhhomienbienthuy, Project: scikit-learn, Lines: 5, Source: conftest.py


Example 19: get_data

    def get_data(self, subset='train', categories=None, shuffle=True, random_state=42):
        data_home = get_data_home()
        cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)
        train_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
        test_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)
        cache = None
        if os.path.exists(cache_path):
            try:
                with open(cache_path, 'rb') as f:
                    compressed_content = f.read()
                uncompressed_content = codecs.decode(
                    compressed_content, 'zlib_codec')
                cache = pickle.loads(uncompressed_content)
            except Exception as e:
                print(80 * '_')
                print('Cache loading failed')
                print(80 * '_')
                print(e)

        if cache is None:
            cache = self.get_cache(train_path=train_path, test_path=test_path, cache_path=cache_path)

        if subset in ('train', 'test'):
            data = cache[subset]
        elif subset == 'all':
            data_lst = list()
            target = list()
            filenames = list()
            for subset in ('train', 'test'):
                data = cache[subset]
                data_lst.extend(data.data)
                target.extend(data.target)
                filenames.extend(data.filenames)

            data.data = data_lst
            data.target = np.array(target)
            data.filenames = np.array(filenames)
        else:
            raise ValueError(
                "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

        data.description = 'The HIV dataset'

        if categories is not None:
            labels = [(data.target_names.index(cat), cat) for cat in categories]
            # Sort the categories to have the ordering of the labels
            labels.sort()
            labels, categories = zip(*labels)
            mask = np.in1d(data.target, labels)
            data.filenames = data.filenames[mask]
            data.target = data.target[mask]
            # searchsorted to have continuous labels
            data.target = np.searchsorted(labels, data.target)
            data.target_names = list(categories)
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[mask]
            data.data = data_lst.tolist()

        if shuffle:
            random_state = validation.check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()

        return data
Developer: yuravariat, Project: TweetsClassifier, Lines: 71, Source: data.py


Example 20: setup_working_with_text_data

def setup_working_with_text_data():
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Developer: AlexisMignon, Project: scikit-learn, Lines: 5, Source: conftest.py



Note: The sklearn.datasets.get_data_home examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers, and copyright remains with the original authors; please consult the corresponding project's license before distributing or using the code. Do not republish without permission.

