def _generate_alpha_diversity_boxplots(collated_adiv_dir, map_fp,
split_category, comparison_category,
rarefaction_depth, output_dir):
"""Generates per-body-site self vs. other alpha diversity boxplots.
Creates a plot for each input collated alpha diversity file (i.e. metric)
in collated_adiv_dir. Returns a list of plot filenames that were created in
output_dir.
Arguments:
collated_adiv_dir - path to directory containing one or more collated
alpha diversity files
map_fp - filepath to metadata mapping file
split_category - category to split on, e.g. body site. A boxplot will
be created for each category value (e.g. tongue, palm, etc.)
comparison_category - category to split on within each of the split
categories (e.g. self, other)
rarefaction_depth - rarefaction depth to use when pulling data from
rarefaction files
output_dir - directory to write output plot images to
"""
metadata_map = MetadataMap.parseMetadataMap(open(map_fp, 'U'))
collated_adiv_fps = glob(join(collated_adiv_dir, '*.txt'))
plot_title = 'Alpha diversity (%d seqs/sample)' % rarefaction_depth
# Generate a plot for each collated alpha diversity metric file.
created_files = []
for collated_adiv_fp in collated_adiv_fps:
adiv_metric = splitext(basename(collated_adiv_fp))[0]
x_tick_labels, dists = _collect_alpha_diversity_boxplot_data(
open(collated_adiv_fp, 'U'), metadata_map, rarefaction_depth,
split_category, comparison_category)
plot_figure = generate_box_plots(dists,
x_tick_labels=x_tick_labels,
title=plot_title,
x_label='Grouping',
y_label=format_title(adiv_metric))
plot_fp = join(output_dir, '%s.png' % adiv_metric)
plot_figure.savefig(plot_fp)
created_files.append(basename(plot_fp))
return created_files
def subset_groups(dm_f, map_f, category, max_group_size):
dm_labels, dm_data = parse_distmat(dm_f)
metadata_map = MetadataMap.parseMetadataMap(map_f)
category_map = defaultdict(list)
for samp_id in metadata_map.SampleIds:
# Mapping files can have more samples than distance matrices, which can
# happen in this case since we are dealing with rarefied OTU tables
# (samples get dropped).
if samp_id in dm_labels:
category_val = metadata_map.getCategoryValue(samp_id, category)
category_map[category_val].append(samp_id)
samp_ids_to_keep = []
for category_val, samp_ids in category_map.items():
samp_ids_to_keep.extend(
sample(samp_ids, min(max_group_size, len(samp_ids))))
return filter_samples_from_distance_matrix((dm_labels, dm_data),
samp_ids_to_keep, negate=True)
def check_mapping_file_category(loaded_biom, mapping_fp, mapping_category, subcategory_1, subcategory_2):
#remove mapping file samples that are not in the input BIOM table
with open(mapping_fp, 'U') as map_f:
md_map = MetadataMap.parseMetadataMap(map_f)
md_map.filterSamples(loaded_biom.ids(axis='sample'), strict=True)
if mapping_category not in md_map.CategoryNames:
raise ValueError("category '%s' not found in mapping file "
"columns." % mapping_category)
all_subcategories = md_map.getCategoryValues(md_map.sample_ids, mapping_category)
if subcategory_1 not in all_subcategories:
raise ValueError("subcategory_1 (-x) '%s' not found in selected "
"mapping file column." % subcategory_1)
if subcategory_2 not in all_subcategories:
raise ValueError("subcategory_2 (-y) '%s' not found in selected "
"mapping file column." % subcategory_2)
if subcategory_2 == subcategory_1:
raise ValueError("subcategory_1 (-x) must be different from subcategory_2 (-y)")
def sample_ids_from_category_state_coverage(mapping_f,
coverage_category,
subject_category,
min_num_states=None,
required_states=None,
considered_states=None,
splitter_category=None):
"""Filter sample IDs based on subject's coverage of a category.
Given a category that groups samples by subject (subject_category), samples
are filtered by how well a subject covers (i.e. has at least one sample
for) the category states in coverage_category.
Two filtering criteria are provided (min_num_states and required_states). At
least one must be provided. If both are provided, the subject must meet
both criteria to pass the filter (i.e. providing both filters is an AND,
not an OR, operation).
A common use case is to provide a 'time' category for coverage_category and
an 'individual' category for subject_category in order to filter out
individuals from a study that do not have samples for some minimum number
of timepoints (min_num_states) and that do not have samples for certain
timepoints (required_states). For example, this could be the first and last
timepoints in the study.
Returns a set of sample IDs to keep, the number of subjects that were
kept, and a set of the unique category states in coverage_category that
were kept. The set of sample IDs is not guaranteed to be in any specific
order relative to the order of sample IDs or subjects in the mapping file.
Arguments:
mapping_f - metadata mapping file (file-like object)
coverage_category - category to test subjects' coverage (string)
subject_category - category to group samples by subject (string)
min_num_states - minimum number of category states in coverage_category
that a subject must cover (i.e. have at least one sample for) to be
included in results (integer)
required_states - category states in coverage_category that must be
covered by a subject's samples in order to be included in results
(list of strings or items that can be converted to strings)
considered_states - category states that are counted toward the
min_num_states (list of strings or items that can be converted to
strings)
splitter_category - category to split input mapping file on prior to
processing. If not supplied, the mapping file will not be split. If
supplied, a dictionary mapping splitter_category state to results
will be returned instead of the three-element tuple. The supplied
filtering criteria will apply to each split piece of the mapping
file independently (e.g. if an individual passes the filters for
the tongue samples, his/her tongue samples will be included for
the tongue results, even if he/she doesn't pass the filters for the
palm samples)
"""
metadata_map = MetadataMap.parseMetadataMap(mapping_f)
# Make sure our input looks sane.
categories_to_test = [coverage_category, subject_category]
if splitter_category is not None:
categories_to_test.append(splitter_category)
if 'SampleID' in categories_to_test:
raise ValueError("The 'SampleID' category is not suitable for use in "
"this function. Please choose a different category "
"from the metadata mapping file.")
for category in categories_to_test:
if category not in metadata_map.CategoryNames:
raise ValueError("The category '%s' is not in the metadata "
"mapping file." % category)
if len(set(categories_to_test)) < len(categories_to_test):
raise ValueError("The coverage, subject, and (optional) splitter "
"categories must all be unique.")
if required_states is not None:
# required_states must be in coverage_category's states in the mapping
# file.
required_states = set(map(str,required_states))
valid_coverage_states = set(metadata_map.getCategoryValues(
metadata_map.SampleIds, coverage_category))
invalid_coverage_states = required_states - valid_coverage_states
if invalid_coverage_states:
raise ValueError("The category state(s) '%s' are not in the '%s' "
"category in the metadata mapping file." %
(', '.join(invalid_coverage_states),
coverage_category))
if considered_states is not None:
# considered_states is not as restrictive as required_states - we don't
# require that these are present, so it's OK if some of the states
# listed here don't actually show up in the mapping file (allowing
# the user to pass something like range(100) to consider only states
# that fall in some range)
considered_states = set(map(str,considered_states))
# define a function to determine if a state should be considered
consider_state = lambda s: s in considered_states
else:
# define a dummy function to consider all states (the default
# if the user does not provide a list of considered_states)
#.........这里部分代码省略.........
if '&&' in col:
for _col in col.split('&&'):
if _col not in lookup_header:
offending_fields.append(col)
elif col not in lookup_header:
offending_fields.append(col)
else:
# if the user didn't specify the header names display everything
color_by_column_names = header[:]
# extract a list of the custom axes provided and each element is numeric
if custom_axes:
custom_axes = custom_axes.strip().strip("'").strip('"').split(',')
# the MetadataMap object makes some checks easier
map_object = MetadataMap(mapping_file_to_dict(mapping_data, header), [])
for axis in custom_axes:
# append the field to the error queue that it belongs to
if axis not in lookup_header:
offending_fields.append(axis)
break
# make sure this value is in the mapping file
elif axis not in color_by_column_names:
color_by_column_names.append(axis)
# perform only if the for loop does not call break
else:
# make sure all these axes are numeric
for axis in custom_axes:
if map_object.isNumericCategory(axis) == False:
non_numeric_categories.append(axis)
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
"""Runs the specified statistical method using the category of interest.
This method does not return anything; all output is written to results
files in out_dir.
Arguments:
dm_fp - filepath to the input distance matrix
map_fp - filepath to the input metadata mapping file
categories - list of categories in the metadata mapping file to
consider in the statistical test. Multiple categories will only be
considered if method is 'best', otherwise only the first category
will be considered
num_perms - the number of permutations to use when calculating the
p-value. If method is 'best' or 'morans_i', this parameter will be
ignored as they are not permutation-based methods
out_dir - path to the output directory where results files will be
written. It is assumed that this directory already exists and we
have write permissions to it
"""
# Make sure we were passed a list of categories, not a single string.
if not isinstance(categories, ListType):
raise TypeError("The supplied categories must be a list of "
"strings.")
# Special case: we do not allow SampleID as it is not a category, neither
# in data structure representation nor in terms of a statistical test (no
# groups are formed since all entries are unique IDs).
if 'SampleID' in categories:
raise ValueError("Cannot use SampleID as a category because it is a "
"unique identifier for each sample, and thus does "
"not create groups of samples (nor can it be used as "
"a numeric category in Moran's I or BEST analyses). "
"Please use a different metadata column to perform "
"statistical tests on.")
# Parse the mapping file and distance matrix.
with open(map_fp, 'U') as map_f:
md_map = MetadataMap.parseMetadataMap(map_f)
with open(dm_fp, 'U') as dm_f:
dm = SymmetricDistanceMatrix.from_file(dm_f)
# Remove any samples from the mapping file that aren't in the distance
# matrix (important for validation checks). Use strict=True so that an
# error is raised if the distance matrix contains any samples that aren't
# in the mapping file.
md_map.filterSamples(dm.sample_ids, strict=True)
# Run the specified statistical method.
if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
# These methods are run in R. Input validation must be done here before
# running the R commands. The pure-Python implementations perform all
# validation in the classes in the stats module.
# Check to make sure all categories passed in are in mapping file and
# are not all the same value.
for category in categories:
if not category in md_map.CategoryNames:
raise ValueError("Category '%s' not found in mapping file "
"columns." % category)
if md_map.hasSingleCategoryValue(category):
raise ValueError("All values in category '%s' are the "
"same. The statistical method '%s' cannot "
"operate on a category that creates only "
"a single group of samples (e.g. there "
"are no 'between' distances because "
"there is only a single group)."
% (category, method))
# Build the command arguments string.
command_args = ['-d %s -m %s -c %s -o %s'
% (dm_fp, map_fp, categories[0], out_dir)]
if method == 'morans_i':
# Moran's I requires only numeric categories.
for category in categories:
if not md_map.isNumericCategory(category):
raise TypeError("The category '%s' is not numeric. Not "
"all values could be converted to numbers."
% category)
else:
# The rest require groups of samples, so the category values cannot
# all be unique.
for category in categories:
if md_map.hasUniqueCategoryValues(category):
raise ValueError("All values in category '%s' are unique. "
"This statistical method cannot operate "
"on a category with unique values (e.g. "
"there are no 'within' distances because "
"each group of samples contains only a "
"single sample)." % category)
# Only Moran's I doesn't accept a number of permutations.
if num_perms < 0:
raise ValueError("The number of permutations must be greater "
"than or equal to zero.")
#.........这里部分代码省略.........
def run_core_diversity_analyses(
biom_fp,
mapping_fp,
sampling_depth,
output_dir,
qiime_config,
command_handler=call_commands_serially,
tree_fp=None,
params=None,
categories=None,
arare_min_rare_depth=10,
arare_num_steps=10,
parallel=False,
suppress_taxa_summary=False,
suppress_beta_diversity=False,
suppress_alpha_diversity=False,
suppress_otu_category_significance=False,
status_update_callback=print_to_stdout):
"""
"""
if categories != None:
# Validate categories provided by the users
mapping_data, mapping_comments = \
parse_mapping_file_to_dict(open(mapping_fp,'U'))
metadata_map = MetadataMap(mapping_data, mapping_comments)
for c in categories:
if c not in metadata_map.CategoryNames:
raise ValueError, ("Category '%s' is not a column header "
"in your mapping file. "
"Categories are case and white space sensitive. Valid "
"choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
if metadata_map.hasSingleCategoryValue(c):
raise ValueError, ("Category '%s' contains only one value. "
"Categories analyzed here require at least two values." % c)
else:
categories= []
# prep some variables
if params == None:
params = parse_qiime_parameters([])
create_dir(output_dir)
index_fp = '%s/index.html' % output_dir
index_links = []
commands = []
# begin logging
old_log_fps = glob(join(output_dir,'log_20*txt'))
log_fp = generate_log_fp(output_dir)
index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
for old_log_fp in old_log_fps:
index_links.append(('Previous run log',old_log_fp,_index_headers['run_summary']))
logger = WorkflowLogger(log_fp,
params=params,
qiime_config=qiime_config)
input_fps = [biom_fp,mapping_fp]
if tree_fp != None:
input_fps.append(tree_fp)
log_input_md5s(logger,input_fps)
# run 'biom summarize-table' on input BIOM table
try:
params_str = get_params_str(params['biom-summarize-table'])
except KeyError:
params_str = ''
biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
if not exists(biom_table_stats_output_fp):
biom_table_summary_cmd = \
"biom summarize-table -i %s -o %s --suppress-md5 %s" % \
(biom_fp, biom_table_stats_output_fp,params_str)
commands.append([('Generate BIOM table summary',
biom_table_summary_cmd)])
else:
logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" \
% biom_table_stats_output_fp)
index_links.append(('BIOM table statistics',
biom_table_stats_output_fp,
_index_headers['run_summary']))
# filter samples with fewer observations than the requested sampling_depth.
# since these get filtered for some analyses (eg beta diversity after
# even sampling) it's useful to filter them here so they're filtered
# from all analyses.
filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
if not exists(filtered_biom_fp):
filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
(biom_fp,filtered_biom_fp,sampling_depth)
commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
filter_samples_cmd)])
else:
logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" \
% filtered_biom_fp)
biom_fp = filtered_biom_fp
# run initial commands and reset the command list
if len(commands) > 0:
command_handler(commands,
status_update_callback,
logger,
#.........这里部分代码省略.........
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
"""Runs the specified statistical method using the category of interest.
This method does not return anything; all output is written to results
files in out_dir.
Arguments:
dm_fp - filepath to the input distance matrix
map_fp - filepath to the input metadata mapping file
categories - list of categories in the metadata mapping file to
consider in the statistical test. Multiple categories will only be
considered if method is 'bioenv', otherwise only the first category
will be considered
num_perms - the number of permutations to use when calculating the
p-value. If method is 'bioenv' or 'morans_i', this parameter will
be ignored as they are not permutation-based methods
out_dir - path to the output directory where results files will be
written. It is assumed that this directory already exists and we
have write permissions to it
"""
# Make sure we were passed a list of categories, not a single string.
if not isinstance(categories, ListType):
raise TypeError("The supplied categories must be a list of "
"strings.")
# Special case: we do not allow SampleID as it is not a category, neither
# in data structure representation nor in terms of a statistical test (no
# groups are formed since all entries are unique IDs).
if 'SampleID' in categories:
raise ValueError("Cannot use SampleID as a category because it is a "
"unique identifier for each sample, and thus does "
"not create groups of samples (nor can it be used as "
"a numeric category in Moran's I or BIO-ENV "
"analyses). Please choose a different metadata "
"column to perform statistical tests on.")
dm = DistanceMatrix.read(dm_fp)
if method in ('anosim', 'permanova', 'bioenv'):
with open(map_fp, 'U') as map_f:
md_dict = parse_mapping_file_to_dict(map_f)[0]
df = pd.DataFrame.from_dict(md_dict, orient='index')
out_fp = join(out_dir, '%s_results.txt' % method)
if method in ('anosim', 'permanova'):
if method == 'anosim':
method_cls = ANOSIM
elif method == 'permanova':
method_cls = PERMANOVA
method_inst = method_cls(dm, df, column=categories[0])
results = method_inst(num_perms)
with open(out_fp, 'w') as out_f:
out_f.write(results.summary())
elif method == 'bioenv':
results = bioenv(dm, df, columns=categories)
results.to_csv(out_fp, sep='\t')
else:
# Remove any samples from the mapping file that aren't in the distance
# matrix (important for validation checks). Use strict=True so that an
# error is raised if the distance matrix contains any samples that
# aren't in the mapping file.
with open(map_fp, 'U') as map_f:
md_map = MetadataMap.parseMetadataMap(map_f)
md_map.filterSamples(dm.ids, strict=True)
# These methods are run in R. Input validation must be done here before
# running the R commands.
if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
# Check to make sure all categories passed in are in mapping file
# and are not all the same value.
for category in categories:
if not category in md_map.CategoryNames:
raise ValueError("Category '%s' not found in mapping file "
"columns." % category)
if md_map.hasSingleCategoryValue(category):
raise ValueError("All values in category '%s' are the "
"same. The statistical method '%s' "
"cannot operate on a category that "
"creates only a single group of samples "
"(e.g. there are no 'between' distances "
"because there is only a single group)."
% (category, method))
# Build the command arguments string.
command_args = ['-d %s -m %s -c %s -o %s'
% (dm_fp, map_fp, categories[0], out_dir)]
if method == 'morans_i':
# Moran's I requires only numeric categories.
for category in categories:
if not md_map.isNumericCategory(category):
raise TypeError("The category '%s' is not numeric. "
"Not all values could be converted to "
"numbers." % category)
else:
# The rest require groups of samples, so the category values
#.........这里部分代码省略.........
def _color_field_states(map_f, samp_ids, field, field_states, color_by_field):
"""Colors one field by another.
Returns a list of matplotlib-compatible colors, one for each of the input
field_states. Also returns a dictionary mapping color_by_field states to
colors (useful for building a legend, for example).
If there are not enough colors available (they are drawn from
qiime.colors.data_colors), an error will be raised as the color mapping
(and legend) will be ambiguous.
A one-to-one mapping must exist between each field_state and its
corresponding color_by field state (otherwise it is unclear which
corresponding color_by field state should be used to color it by). An error
will be raised if this one-to-one mapping does not exist.
Arguments:
map_f - the mapping file (file-like object)
samp_ids - a list of sample IDs to consider in the mapping file. Only
these sample IDs will be used when coloring field states
field - the field in the mapping file to color
field_states - the field states in field to color
color_by_field - the field in the mapping file to color field_states by
"""
colors = []
color_pool = [matplotlib_rgb_color(data_colors[color].toRGB()) for color in data_color_order]
metadata_map = MetadataMap.parseMetadataMap(map_f)
for field_to_check in field, color_by_field:
if field_to_check not in metadata_map.CategoryNames:
raise ValueError("The field '%s' is not in the metadata mapping " "file's column headers." % field_to_check)
all_field_states = metadata_map.getCategoryValues(samp_ids, field)
all_color_by_states = metadata_map.getCategoryValues(samp_ids, color_by_field)
if len(set(field_states) - set(all_field_states)) != 0:
raise ValueError("Encountered unrecognizable field state(s) in %r " "for field '%s'." % (field_states, field))
# Build mapping from one field to the other.
field_mapping = defaultdict(list)
for field_state, color_by_state in zip(all_field_states, all_color_by_states):
if field_state in field_states:
field_mapping[field_state].append(color_by_state)
# For each of the specified input field states, find its corresponding
# "color by" field state and give it a color if it hasn't been assigned one
# yet. Make sure we have enough colors and there is a one-to-one mapping.
color_mapping = {}
for field_state in field_states:
color_by_states = set(field_mapping[field_state])
if len(color_by_states) != 1:
raise ValueError(
"The field '%s' to color by does not have a "
"one-to-one mapping with field '%s'. Coloring "
"would be ambiguous." % (color_by_field, field)
)
color_by_state = list(color_by_states)[0]
if color_by_state not in color_mapping:
if len(color_pool) > 0:
color_mapping[color_by_state] = color_pool.pop(0)
else:
raise ValueError(
"There are not enough available QIIME colors "
"to color each of the field states in field "
"'%s'. Coloring would be ambiguous." % color_by_field
)
colors.append(color_mapping[color_by_state])
return colors, color_mapping
请发表评论