本文整理汇总了Python中skbio.read函数的典型用法代码示例。如果您正苦于以下问题:Python read函数的具体用法?Python read怎么用?Python read使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了read函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_invalid_files
def test_invalid_files(self):
    """Check that malformed qseq fixtures raise the expected errors.

    Each entry of ``self.invalid_files`` is read into every sequence class.
    The raised exception must be of the expected type and its message must
    contain every expected fragment.
    """
    sequence_classes = (BiologicalSequence, NucleotideSequence, DNASequence,
                        RNASequence, ProteinSequence)
    for seq_cls in sequence_classes:
        for path, kwarg_sets, fragments, exc_type in self.invalid_files:
            # NOTE: the ``with`` body ends at the first call that raises, so
            # option sets after that one are not reached.
            with self.assertRaises(exc_type) as caught:
                for options in kwarg_sets:
                    # presumably strips these keys from ``options`` in place
                    # -- confirm against _drop_kwargs
                    _drop_kwargs(options, "constructor", "filter")
                    read(path, format="qseq", verify=False, into=seq_cls,
                         **options)
            for fragment in fragments:
                self.assertIn(fragment, str(caught.exception))
开发者ID:Kleptobismol,项目名称:scikit-bio,代码行数:10,代码来源:test_qseq.py
示例2: test_invalid_files
def test_invalid_files(self):
    """Reading an invalid qseq file must fail with the documented error.

    Iterates over ``self.invalid_files`` for each of ``Sequence``, ``DNA``,
    ``RNA`` and ``Protein`` and verifies both the exception type and that
    every expected fragment appears in the exception message.
    """
    targets = [Sequence, DNA, RNA, Protein]
    for target in targets:
        for bad_file, all_kwargs, expected_msgs, error_type in \
                self.invalid_files:
            # NOTE: only the calls up to (and including) the first one that
            # raises are executed inside this context manager.
            with self.assertRaises(error_type) as ctx:
                for read_kwargs in all_kwargs:
                    # presumably removes these keys from ``read_kwargs`` in
                    # place -- confirm against _drop_kwargs
                    _drop_kwargs(read_kwargs, 'constructor', 'filter')
                    read(bad_file, format='qseq', verify=False,
                         into=target, **read_kwargs)
            for msg in expected_msgs:
                self.assertIn(msg, str(ctx.exception))
开发者ID:Achuth17,项目名称:scikit-bio,代码行数:11,代码来源:test_qseq.py
示例3: test_dna_iterator_to_dna_fasta_format
def test_dna_iterator_to_dna_fasta_format(self):
    """Round-trip DNA sequences through the DNAIterator -> DNAFASTAFormat
    transformer and compare the written file with the source fixture.
    """
    transformer = self.get_transformer(DNAIterator, DNAFASTAFormat)
    filepath = self.get_data_path('dna-sequences.fasta')
    generator = skbio.read(filepath, format='fasta', constructor=skbio.DNA)
    source = DNAIterator(generator)

    obs = transformer(source)
    self.assertIsInstance(obs, DNAFASTAFormat)

    # ``source`` wraps a one-shot generator that the transformer may already
    # have drained; comparing against it again could compare nothing at all.
    # Re-read the fixture so the expectation is independent of the input.
    exp_seqs = list(
        skbio.read(filepath, format='fasta', constructor=skbio.DNA))
    obs_seqs = list(
        skbio.read(str(obs), format='fasta', constructor=skbio.DNA))

    # zip() stops at the shorter input, so check the counts explicitly.
    self.assertEqual(len(obs_seqs), len(exp_seqs))
    for act, exp in zip(obs_seqs, exp_seqs):
        self.assertEqual(act, exp)
开发者ID:BenKaehler,项目名称:q2-types,代码行数:12,代码来源:test_transformer.py
示例4: test_pair_dna_sequences_directory_format_to_pair_dna_iterator
def test_pair_dna_sequences_directory_format_to_pair_dna_iterator(self):
    """Transform a paired-FASTA directory into a PairedDNAIterator and
    compare each yielded item with the (left, right) pair read directly
    from the two fixture files.
    """
    left_name = 'left-dna-sequences.fasta'
    right_name = 'right-dna-sequences.fasta'
    _, obs = self.transform_format(PairedDNASequencesDirectoryFormat,
                                   PairedDNAIterator,
                                   filenames=(left_name, right_name))

    exp_left = skbio.read(self.get_data_path(left_name),
                          format='fasta', constructor=skbio.DNA)
    exp_right = skbio.read(self.get_data_path(right_name),
                           format='fasta', constructor=skbio.DNA)
    expected_pairs = zip(exp_left, exp_right)

    for act, exp in zip(obs, expected_pairs):
        self.assertEqual(act, exp)
    self.assertIsInstance(obs, PairedDNAIterator)
开发者ID:BenKaehler,项目名称:q2-types,代码行数:13,代码来源:test_transformer.py
示例5: _annotate_fp
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                 outfmt='tab', params=None) -> pd.DataFrame:
    '''Annotate the sequences in the file.

    The databases in ``self.dat`` are searched one after another. After
    each search, the sequences whose IDs have no hit so far are written to
    a FASTA file in ``self.tmp_dir``, which becomes the query for the next
    database. The search stops early once that file is empty.

    Parameters
    ----------
    fp : str
        Path of the FASTA file holding the query sequences.
    aligner : str
        Passed to ``run_blast``.
    evalue : float
        Passed to ``run_blast``.
    cpus : int
        Passed to ``run_blast``.
    outfmt : str
        Value of the ``--outfmt`` parameter given to ``run_view``.
    params : dict-like
        Parameters for diamond blastp/blastx that pass to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``parse_tabular`` for every searched
        database, concatenated in search order. Empty if ``self.dat`` is
        empty.
    '''
    # A set gives O(1) membership tests and cannot accumulate duplicates.
    found = set()
    frames = []
    for db in self.dat:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner,
                       evalue=evalue, cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
        hits = self.parse_tabular(out_fp)
        frames.append(hits)
        found.update(hits.index)
        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')
        # no seq left
        if stat(new_fp).st_size == 0:
            break
        fp = new_fp
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
开发者ID:tkosciol,项目名称:micronota,代码行数:32,代码来源:diamond.py
示例6: setUp
def setUp(self):
    """Prepare the blastp test case and a DiamondCache built from the
    sequences in the ``cache.faa`` fixture.
    """
    super().setUp()
    aligner, query_fn = 'blastp', 'WP_009885814.faa'
    query_fp = get_data_path(query_fn)
    expected_fp = _get_named_data_path('%s.diamond' % query_fn)
    # (aligner, input file, expected output file)
    self.blast = (aligner, query_fp, expected_fp)

    cache_fp = _get_named_data_path('cache.faa')
    self.cache = DiamondCache(list(skbio.read(cache_fp, format='fasta')))
开发者ID:elsherbini,项目名称:micronota,代码行数:7,代码来源:test_diamond.py
示例7: test_valid_files
def test_valid_files(self):
    """Read each valid qseq fixture into every sequence class and compare
    the result with a sequence built from the expected components.
    """
    metadata_keys = ('id', 'machine_name', 'run_number', 'lane_number',
                     'tile_number', 'x', 'y', 'index', 'read_number')
    for constructor in [Sequence, DNA, RNA, Protein]:
        # Currently not validating the alphabet for qseq
        # files that are read in for this test.
        skip_validation = hasattr(constructor, 'alphabet')
        for valid, kwargs, components in self.valid_files:
            for observed_kwargs in kwargs:
                expected_kwargs = {}
                if skip_validation:
                    observed_kwargs['validate'] = False
                    expected_kwargs['validate'] = False
                _drop_kwargs(observed_kwargs, 'constructor', 'filter')

                # ``seq_num`` is 1-based and defaults to the first record.
                record = components[observed_kwargs.get('seq_num', 1) - 1]
                metadata = {key: record[key] for key in metadata_keys}
                quality = np.array(record['quality'], np.uint8)

                expected = constructor(
                    record['sequence'],
                    metadata=metadata,
                    positional_metadata={'quality': quality},
                    **expected_kwargs)
                observed = read(valid, into=constructor,
                                format='qseq', verify=False,
                                **observed_kwargs)
                self.assertEqual(observed, expected)
开发者ID:Achuth17,项目名称:scikit-bio,代码行数:33,代码来源:test_qseq.py
示例8: body_site
def body_site(coords, mapping_file, output, filename, sample):
    """Generates a bodysite figure for a sample in the coordinates file

    All samples are plotted on the first two ordination axes, colored by
    their ``TITLE_BODY_SITE`` group, and ``sample`` is drawn on top as a
    larger dot. The figure is saved to ``output/filename``.

    Parameters
    ----------
    coords : object
        Input handed to ``read(..., into=OrdinationResults)``.
    mapping_file : str or file handle
        Tab-separated mapping file with ``#SampleID`` and
        ``TITLE_BODY_SITE`` columns.
    output : str
        Directory in which the figure is written.
    filename : str
        File name of the figure inside ``output``.
    sample : str
        ID of the sample to highlight.

    Raises
    ------
    ValueError
        If ``sample`` is not one of the ordination's site IDs.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, sep='\t', dtype=str)
    mf.set_index('#SampleID', inplace=True)
    mf = mf.loc[o.site_ids]

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    # Build the palette once instead of once per color.
    paired = sns.color_palette('Paired', 12)
    color_hmp_fecal = paired[10]  # light brown
    color_agp_fecal = paired[11]  # dark brown
    color_hmp_oral = paired[0]    # light blue
    color_agp_oral = paired[1]    # dark blue
    color_hmp_skin = paired[2]    # light green
    color_agp_skin = paired[3]    # dark green

    grp_colors = {'AGP-FECAL': color_agp_fecal,
                  'AGP-ORAL':  color_agp_oral,
                  'AGP-SKIN':  color_agp_skin,
                  'HMP-FECAL': color_hmp_fecal,
                  'GG-FECAL':  color_hmp_fecal,
                  'PGP-FECAL': color_hmp_fecal,
                  'HMP-ORAL':  color_hmp_oral,
                  'PGP-ORAL':  color_hmp_oral,
                  'HMP-SKIN':  color_hmp_skin,
                  'PGP-SKIN':  color_hmp_skin}

    # plot categories as 50 slices with random zorder
    # (dict.iteritems() only exists on Python 2; items() works on both)
    for grp, color in grp_colors.items():
        sub_coords = c_df[mf.TITLE_BODY_SITE == grp].values
        for i in np.array_split(sub_coords, 50):
            if i.size == 0:
                continue
            plt.scatter(i[:, 0], i[:, 1], color=color,
                        edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                        alpha=ALPHA, zorder=np.random.rand())

    # plot participant's dot: a white halo with the colored dot on top
    sample_x, sample_y = c_df.loc[sample][0], c_df.loc[sample][1]
    sample_color = grp_colors[mf.loc[sample]['TITLE_BODY_SITE']]
    plt.scatter(sample_x, sample_y,
                color=sample_color,
                s=270, edgecolor='w', zorder=1, lw=LINE_WIDTH_WHITE)
    plt.scatter(sample_x, sample_y,
                color=sample_color,
                s=250, edgecolor=np.asarray(sample_color)*0.6,
                zorder=2, lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    out_file = os.path.join(output, filename)
    # ``figsize`` is not a savefig() argument, so it is no longer passed.
    # The figure keeps whatever size it already has.
    # NOTE(review): presumably older Matplotlib ignored it and newer
    # releases reject it -- confirm against the savefig documentation.
    plt.savefig(out_file, dpi=my_dpi)
    plt.close()
开发者ID:Pratap5387,项目名称:American-Gut,代码行数:60,代码来源:mod2_pcoa.py
示例9: test_fastq_to_sequence
def test_fastq_to_sequence(self):
    """Read single records from valid FASTQ fixtures into each sequence
    class and compare them with sequences built from the expected
    components.
    """
    constructors = [partial(Sequence),
                    partial(DNA, validate=False),
                    partial(RNA, validate=False),
                    partial(Protein, validate=False)]
    for constructor in constructors:
        for valid_files, kwargs, components in self.valid_configurations:
            # skip empty file case since we cannot read a specific
            # sequence from an empty file
            if len(components) == 0:
                continue
            for valid in valid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor')

                    # ``seq_num`` is 1-based; default to the first record.
                    record = components[kwarg.get('seq_num', 1) - 1]
                    metadata = {'id': record[0], 'description': record[1]}
                    quality = np.array(record[3], dtype=np.uint8)
                    expected = constructor(
                        record[2], metadata=metadata,
                        positional_metadata={'quality': quality})

                    # ``into`` needs the class itself, not the partial.
                    observed = read(valid, into=constructor.func,
                                    format='fastq', verify=False, **kwarg)
                    self.assertEqual(observed, expected)
开发者ID:7924102,项目名称:scikit-bio,代码行数:26,代码来源:test_fastq.py
示例10: fungi_from_fasta
def fungi_from_fasta(fasta_fh, accession_fh, taxonomy_fh):
    """Filter SILVA sequences to keep only fungi.

    Filters a fasta file of aligned or unaligned sequences to include only
    fungi. Only keeps sequences that have accession numbers that can be mapped
    to a fungal taxonomy string that ends at the genus rank. Sequences whose
    accession number is missing from the accession map are skipped.

    Parameters
    ----------
    fasta_fh : filehandle
        Fasta file of aligned or unaligned SILVA sequences. Each sequence
        identifier must be an accession number.
    accession_fh : filehandle
        A tab-separated file mapping accession numbers to a mapping number in
        `taxonomy_fh`. This file should contain exactly two columns:
        accession number and mapping number.
    taxonomy_fh : filehandle
        A tab-separated file that identifies the taxonomy and rank of a
        mapping number in `accession_fh`. This file should contain exactly
        five columns beginning with taxonomy, mapping number and rank. The
        last two columns are ignored.

    Returns
    -------
    generator
        Yields ``skbio.BiologicalSequence`` objects.

    """
    accession_map = _parse_accession_map(accession_fh)
    taxonomy_map = _parse_taxonomy_map(taxonomy_fh)

    for seq in skbio.read(fasta_fh, format="fasta"):
        try:
            map_num = accession_map[seq.id]
        except KeyError:
            # An unmapped accession cannot be tied to a fungal taxonomy, so
            # the sequence is filtered out instead of aborting the run.
            continue
        if map_num in taxonomy_map:
            yield seq
开发者ID:wasade,项目名称:ghost-tree,代码行数:34,代码来源:filter.py
示例11: _parse_fasta_dictionary
def _parse_fasta_dictionary(self):
    """Load ``self.fasta_path`` into a dict keyed by sequence ID.

    Each value is the FASTA record wrapped in ``self.sequence_type``.
    """
    make_sequence = self.sequence_type
    records = read(self.fasta_path, format="fasta")
    return {
        record.metadata["id"]: make_sequence(record)
        for record in records
    }
开发者ID:gravity226,项目名称:pyensembl,代码行数:7,代码来源:sequence_data.py
示例12: test_dna_fasta_format_to_dna_iterator
def test_dna_fasta_format_to_dna_iterator(self):
    """Transform a DNAFASTAFormat file into a DNAIterator and compare its
    sequences with those read directly from the same file.
    """
    source, obs = self.transform_format(
        DNAFASTAFormat, DNAIterator, filename='dna-sequences.fasta')

    exp = skbio.read(str(source), format='fasta', constructor=skbio.DNA)

    for pair in zip(obs, exp):
        self.assertEqual(*pair)
开发者ID:BenKaehler,项目名称:q2-types,代码行数:8,代码来源:test_transformer.py
示例13: gradient
def gradient(coords, mapping_file, color, output, filename, sample):
    """Generate a gradient-colored ordination figure highlighting a sample

    Samples whose ``color`` value is numeric are colored along the RdBu
    colormap, the remaining samples are drawn in gray, and ``sample`` is
    drawn on top as a larger dot. The figure is saved to
    ``output/filename``.

    Parameters
    ----------
    coords : object
        Input handed to ``read(..., into=OrdinationResults)``.
    mapping_file : str or file handle
        Tab-separated mapping file with a ``#SampleID`` column.
    color : str
        Mapping-file column providing the gradient values.
    output : str
        Directory in which the figure is written.
    filename : str
        File name of the figure inside ``output``.
    sample : str
        ID of the sample to highlight.

    Raises
    ------
    ValueError
        If ``sample`` is not one of the ordination's site IDs.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file (``sep`` must be passed by keyword on recent pandas)
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    mf.set_index('#SampleID', inplace=True)
    mf = mf.loc[o.site_ids]
    # Series.convert_objects was removed from pandas; to_numeric with
    # errors='coerce' turns unparseable values into NaN.
    mf[color] = pd.to_numeric(mf[color], errors='coerce')

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    numeric = mf[~pd.isnull(mf[color])]
    non_numeric = mf[pd.isnull(mf[color])]

    color_array = plt.cm.RdBu(numeric[color]/max(numeric[color]))

    # plot numeric metadata as colored gradient
    ids = numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c=numeric[color], cmap=plt.get_cmap('RdBu'),
                alpha=ALPHA, lw=LINE_WIDTH, edgecolor=color_array*0.6)

    # plot non-numeric metadata as gray
    ids = non_numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c='0.5', alpha=ALPHA, lw=LINE_WIDTH, edgecolor='0.3')

    # plot individual's dot; gray when the sample has no numeric value
    try:
        color_index = numeric.index.tolist().index(sample)
    except ValueError:
        color_index = None

    if color_index is None:
        _color = (0.5, 0.5, 0.5)
    else:
        _color = color_array[color_index]

    sample_x, sample_y = c_df.loc[sample][0], c_df.loc[sample][1]
    plt.scatter(sample_x, sample_y,
                color=_color, s=270, edgecolor='w', lw=LINE_WIDTH_WHITE)
    plt.scatter(sample_x, sample_y,
                color=_color, s=250, edgecolor=np.asarray(_color)*0.6,
                lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    out_file = os.path.join(output, filename)
    # ``figsize`` is not a savefig() argument, so it is no longer passed.
    # The figure keeps whatever size it already has.
    # NOTE(review): presumably older Matplotlib ignored it and newer
    # releases reject it -- confirm against the savefig documentation.
    plt.savefig(out_file, dpi=my_dpi)
    plt.close()
开发者ID:Pratap5387,项目名称:American-Gut,代码行数:56,代码来源:mod2_pcoa.py
示例14: sort_uniref
def sort_uniref(db_fp, uniref_fp, out_d, resolution, force=False):
    '''Sort UniRef sequences into different partitions.

    Each sequence is written to one FASTA file in ``out_d`` chosen by the
    status and kingdom stored for its accession in the metadata database.
    Files are named ``uniref<resolution>_<status>_<kingdom>.fasta``, one
    per combination of ``_status`` and ``_kingdom``. Sequences with no
    metadata row go to ``uniref<resolution>__other.fasta``. ``make_db`` is
    then called on every non-empty output file.

    Per the original documentation, the partitions for UniRef100 are
    Swiss-Prot and TrEMBL, each crossed with Archaea, Bacteria, Viruses,
    Eukaryota and other, plus the catch-all ``_other`` file.

    Parameters
    ----------
    db_fp : str
        The database file created by ``prepare_metadata``.
    uniref_fp : str
        The UniRef100 fasta file. gzipped or not.
    out_d : str
        The output directory to place the resulting fasta files.
    resolution : int
        UniRef resolution. It is used in the ``UniRef<resolution>_`` ID
        prefix that is stripped to get the accession, and in the output
        file names.
    force : bool
        Passed to ``_overwrite`` together with ``out_d``.
    '''
    _overwrite(out_d, force)
    makedirs(out_d)
    logger = getLogger(__name__)
    logger.info('Sorting UniRef sequences')
    fns = ['%s_%s' % (i, j) for i, j in product(_status, _kingdom)]
    fns.append('_other')
    fps = [join(out_d, 'uniref%d_%s.fasta' % (resolution, f)) for f in fns]

    files = {}
    try:
        # Open inside the try so that handles opened before a failure are
        # still closed by the finally clause.
        for fn, fp in zip(fns, fps):
            files[fn] = open(fp, 'w')
        with connect(db_fp) as conn:
            cursor = conn.cursor()
            for seq in read(uniref_fp, format='fasta', constructor=Sequence):
                seq_id = seq.metadata['id']
                ac = seq_id.replace('UniRef%d_' % resolution, '')
                # Default partition for accessions without a metadata row;
                # joins to '_other'.
                group = ['', 'other']
                cursor.execute('SELECT * FROM metadata WHERE ac = ?',
                               (ac,))
                # If several rows match, the last one wins.
                for _, s, k in cursor.fetchall():
                    group[0] = _status[s]
                    group[1] = _kingdom[k]
                seq.write(files['_'.join(group)])
    finally:
        for fh in files.values():
            fh.close()

    for fp in fps:
        # if the fasta file is not empty
        if stat(fp).st_size > 0:
            make_db(fp)
开发者ID:elsherbini,项目名称:micronota,代码行数:56,代码来源:_uniref.py
示例15: test_pair_dna_iterator_to_pair_dna_sequences_directory_format
def test_pair_dna_iterator_to_pair_dna_sequences_directory_format(self):
    """Write paired DNA sequences through the PairedDNAIterator ->
    PairedDNASequencesDirectoryFormat transformer and compare the written
    files with the source fixtures.
    """
    transformer = self.get_transformer(PairedDNAIterator,
                                       PairedDNASequencesDirectoryFormat)

    # Materialize the fixtures: the generators returned by skbio.read are
    # one-shot, and the transformer consumes whatever it is handed. Without
    # the lists the expectation below would be empty and nothing would be
    # compared.
    l_seqs = list(skbio.read(self.get_data_path('left-dna-sequences.fasta'),
                             format='fasta', constructor=skbio.DNA))
    r_seqs = list(skbio.read(self.get_data_path('right-dna-sequences.fasta'),
                             format='fasta', constructor=skbio.DNA))
    pairs = PairedDNAIterator(zip(l_seqs, r_seqs))

    obs = transformer(pairs)

    obs_l = skbio.read('%s/left-dna-sequences.fasta' % str(obs),
                       format='fasta', constructor=skbio.DNA)
    obs_r = skbio.read('%s/right-dna-sequences.fasta' % str(obs),
                       format='fasta', constructor=skbio.DNA)
    obs_pairs = list(zip(obs_l, obs_r))
    exp_pairs = list(zip(l_seqs, r_seqs))

    # zip() stops at the shorter input, so check the counts explicitly.
    self.assertEqual(len(obs_pairs), len(exp_pairs))
    for act, exp in zip(obs_pairs, exp_pairs):
        self.assertEqual(act, exp)
    self.assertIsInstance(obs, PairedDNASequencesDirectoryFormat)
开发者ID:BenKaehler,项目名称:q2-types,代码行数:19,代码来源:test_transformer.py
示例16: _annotate_fp
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                 outfmt='sam', params=None):
    '''Annotate the sequences in the file.

    The databases are searched one after another, starting with the cache
    database when a non-empty cache is available. After each search, the
    sequences whose IDs have no hit so far are written to a FASTA file in
    ``self.tmp_dir``, which becomes the query for the next database. The
    search stops early once that file is empty.

    Parameters
    ----------
    fp : str
        Path of the FASTA file holding the query sequences.
    aligner : str
        Passed to ``run_blast``.
    evalue : float
        Passed to ``run_blast``.
    cpus : int
        Passed to ``run_blast``.
    outfmt : str
        Value of the ``--outfmt`` parameter given to ``run_view``. Only
        ``'tab'`` and ``'sam'`` results are parsed and collected.
    params : dict-like
        Parameters for diamond blastp/blastx that pass to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The filtered hits of every searched database, concatenated in
        search order. Empty if nothing was collected.
    '''
    if self.has_cache() and not self.cache.is_empty():
        self.cache.build()
        dbs = [self.cache.db] + self.dat
    else:
        dbs = self.dat
    seqs = []
    found = set()
    frames = []
    n_hits = 0
    logger = getLogger(__name__)
    for db in dbs:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner,
                       evalue=evalue, cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
        if outfmt == 'tab':
            hits = self._filter_best(self.parse_tabular(out_fp))
        elif outfmt == 'sam':
            hits = self._filter_id_cov(self.parse_sam(out_fp))
        else:
            # Other output formats are not parsed.
            hits = None
        if hits is not None:
            frames.append(hits)
            found.update(hits.index)
            n_hits += len(hits.index)
        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')
        # Cumulative number of hits over the databases searched so far.
        logger.info('Number of diamond hits: %d', n_hits)
        # no seq left
        if stat(new_fp).st_size == 0:
            break
        fp = new_fp

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    res = pd.concat(frames) if frames else pd.DataFrame()

    if outfmt == 'sam' and self.has_cache():
        for x in res.index:
            seqs.append(
                Sequence(res.loc[x, 'sseq'],
                         metadata={'id': res.loc[x, 'sseqid']}))

    # Update cache (inplace)
    if self.has_cache():
        self.cache.update(seqs)
        self.cache.close()
    return res
开发者ID:sjanssen2,项目名称:micronota,代码行数:54,代码来源:diamond.py
示例17: setUp
def setUp(self):
    """Prepare the blastp/blastx test cases and a DiamondCache built from
    the sequences in the ``cache.faa`` fixture.
    """
    super().setUp()
    Test = namedtuple('Test', ['aligner', 'input', 'exp'])
    cases = [('blastp', 'WP_009885814.faa'),
             ('blastx', 'WP_009885814.fna')]
    tests = []
    for aligner, query_fn in cases:
        query_fp = get_data_path(query_fn)
        expected_fp = _get_named_data_path('%s.diamond' % query_fn)
        tests.append(Test(aligner, query_fp, expected_fp))
    self.tests = tests

    cache_fp = _get_named_data_path('cache.faa')
    self.cache = DiamondCache(list(skbio.read(cache_fp, format='fasta')))
开发者ID:sjanssen2,项目名称:micronota,代码行数:12,代码来源:test_diamond.py
示例18: subsample_dm
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    Parameters
    ----------
    distmat : object
        Input handed to ``read(..., into=DistanceMatrix)``.
    mapping_file : str or file handle
        Tab-separated mapping file with a ``#SampleID`` column.
    max : int
        Maximum passed to ``isubsample``.
    category : str
        Mapping-file column whose value bins each sample ID.
    output : object
        Destination passed to ``DistanceMatrix.to_file``.
    """
    # ``sep`` must be passed by keyword on recent pandas.
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(x):
        return id_to_cat[x]

    dm = read(distmat, into=DistanceMatrix)
    # Keep the second element of each pair yielded by isubsample,
    # presumably the sample ID -- confirm against isubsample.
    dm = dm.filter([sample_id
                    for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)])
    dm.to_file(output)
开发者ID:jnpaulson,项目名称:American-Gut,代码行数:12,代码来源:mod2_pcoa.py
示例19: test_valid_files
def test_valid_files(self):
    """Read each valid qseq fixture into every sequence class and compare
    the result with a sequence built from the expected components.
    """
    sequence_classes = (BiologicalSequence, NucleotideSequence, DNASequence,
                        RNASequence, ProteinSequence)
    for seq_cls in sequence_classes:
        for path, kwarg_sets, components in self.valid_files:
            for options in kwarg_sets:
                _drop_kwargs(options, "constructor", "filter")

                # ``seq_num`` is 1-based and defaults to the first record.
                record = components[options.get("seq_num", 1) - 1]
                seq_id, sequence, quality = record[0], record[1], record[2]
                expected = seq_cls(sequence, id=seq_id, quality=quality)

                observed = read(path, into=seq_cls, format="qseq",
                                verify=False, **options)
                self.assertTrue(observed.equals(expected))
开发者ID:Kleptobismol,项目名称:scikit-bio,代码行数:12,代码来源:test_qseq.py
示例20: parse_sam
def parse_sam(diamond_res, column=None, collapse=False):
    '''Parse the output of diamond blastp/blastx.

    Parameters
    ----------
    diamond_res : str
        file path
    column : str
        The column used to pick the best hits. When given, only the row
        with the maximum value of this column is kept for each query
        sequence and the result is indexed by ``qseqid``. When ``None``,
        every record is returned.
    collapse : bool
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``sseqid``, ``evalue``, ``bitscore`` and ``sequence`` of
        the parsed records (the best matched record for each query
        sequence when ``column`` is given).
    '''
    columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
               'gapopen', 'qstart', 'qend', 'sstart', 'send',
               'evalue', 'bitscore', 'sequence']
    # Collect the rows first and build the frame once. Growing a DataFrame
    # with ``df.loc[i] = row`` reallocates on every row.
    rows = []
    for seq in read(diamond_res, format='sam'):
        md = seq.metadata
        rows.append([
            md['QNAME'],   # qseqid
            md['RNAME'],   # sseqid
            md['ZI'],      # pident
            md['ZL'],      # length
            md['CIGAR'],   # mismatch
            # NOTE(review): a CIGAR string in the mismatch column looks
            # like a placeholder -- confirm
            '',            # gapopen (not provided)
            md['POS'],     # qstart
            '',            # qend (not provided)
            md['ZS'],      # sstart
            '',            # send (not provided)
            md['ZE'],      # evalue
            md['ZR'],      # bitscore
            str(seq),      # sequence
        ])
    # dtype=object keeps the values as read, without dtype inference.
    df = pd.DataFrame(rows, columns=columns, dtype=object)

    if column is not None:
        idx = df.groupby('qseqid')[column].idxmax()
        df_max = df.loc[idx]
        df_max.index = idx.index
        df = df_max[['sseqid', 'evalue', 'bitscore', 'sequence']]
    else:
        df = df[['sseqid', 'evalue', 'bitscore', 'sequence']]
    return df
开发者ID:mortonjt,项目名称:micronota,代码行数:50,代码来源:diamond.py
注:本文中的skbio.read函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论