本文整理汇总了Python中utils.is_normed函数的典型用法代码示例。如果您正苦于以下问题:Python is_normed函数的具体用法?Python is_normed怎么用?Python is_normed使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了is_normed函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: read_erosion_info
def read_erosion_info(self, this_gene, approved_genes=None):
# NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
if approved_genes == None:
approved_genes = [this_gene]
genes_used = set()
for erosion in utils.real_erosions + utils.effective_erosions:
if erosion[0] != self.region:
continue
self.erosion_probs[erosion] = {}
deps = utils.column_dependencies[erosion + "_del"]
with opener("r")(
self.indir + "/" + utils.get_parameter_fname(column=erosion + "_del", deps=deps)
) as infile:
reader = csv.DictReader(infile)
for line in reader:
# first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
if (
self.region + "_gene" in line and line[self.region + "_gene"] not in approved_genes
): # NOTE you'll need to change this if you want it to depend on another region's genes
continue
# then skip nonsense erosions that're too long for this gene, but were ok for another
if int(line[erosion + "_del"]) >= len(self.germline_seq):
continue
# then add in this erosion's counts
n_eroded = int(line[erosion + "_del"])
if n_eroded not in self.erosion_probs[erosion]:
self.erosion_probs[erosion][n_eroded] = 0.0
self.erosion_probs[erosion][n_eroded] += float(line["count"])
if self.region + "_gene" in line:
genes_used.add(line[self.region + "_gene"])
assert len(self.erosion_probs[erosion]) > 0
# do some smoothingy things NOTE that we normalize *after* interpolating
if (
erosion in utils.real_erosions
): # for real erosions, don't interpolate if we lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
n_max = self.n_max_to_interpolate
else: # for fake erosions, always interpolate
n_max = -1
# print ' interpolate erosions'
interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
self.add_pseudocounts(self.erosion_probs[erosion])
# and finally, normalize
total = 0.0
for _, val in self.erosion_probs[erosion].iteritems():
total += val
test_total = 0.0
for n_eroded in self.erosion_probs[erosion]:
self.erosion_probs[erosion][n_eroded] /= total
test_total += self.erosion_probs[erosion][n_eroded]
assert utils.is_normed(test_total)
if len(genes_used) > 1: # if length is 1, we will have just used the actual gene
if self.args.debug:
print " erosions used:", " ".join(genes_used)
开发者ID:stevenweaver,项目名称:partis,代码行数:60,代码来源:hmmwriter.py
示例2: read_mute_freqs
def read_mute_freqs(self, mute_freq_dir):
# NOTE these are mute freqs, not branch lengths, but it's ok for now
for mtype in ['all',] + utils.regions:
infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
self.branch_lengths[mtype] = {}
self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype+'-mute-freqs')
self.branch_lengths[mtype]['mean'] = mutehist.GetMean()
if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
print 'WARNING nonzero under/overflow bins read from %s' % infname
check_sum = 0.0
for ibin in range(1, mutehist.GetNbinsX()+1): # ignore under/overflow bins
freq = mutehist.GetBinCenter(ibin)
branch_length = float(freq)
prob = mutehist.GetBinContent(ibin)
self.branch_lengths[mtype]['lengths'].append(branch_length)
self.branch_lengths[mtype]['probs'].append(prob)
check_sum += self.branch_lengths[mtype]['probs'][-1]
assert utils.is_normed(check_sum)
if self.args.debug:
print ' mean branch lengths'
for mtype in ['all',] + utils.regions:
print ' %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
开发者ID:matsengrp,项目名称:bioboxpartis,代码行数:26,代码来源:treegenerator.py
示例3: read_mute_freqs
def read_mute_freqs(self, mute_freq_dir):
    """Read mean mutation-frequency histograms for 'all' and each region from <mute_freq_dir>, and fill self.branch_lengths with (length, prob) distributions."""
    # NOTE these are mute freqs, not branch lengths, but it's ok for now
    for mtype in ['all',] + utils.regions:
        infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
        self.branch_lengths[mtype] = {}
        self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
        mutehist = Hist(fname=infname)
        self.branch_lengths[mtype]['mean'] = mutehist.get_mean()  # note: mean is taken *before* the renormalization below
        # if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
        #     print 'WARNING nonzero under/overflow bins read from %s' % infname
        mutehist.normalize(include_overflows=False, overflow_eps_to_ignore=1e-2)  # if it was written with overflows included, it'll need to be renormalized
        check_sum = 0.0
        for ibin in range(1, mutehist.n_bins + 1):  # ignore under/overflow bins
            freq = mutehist.get_bin_centers()[ibin]
            branch_length = self.convert_observed_changes_to_branch_length(float(freq))
            prob = mutehist.bin_contents[ibin]
            self.branch_lengths[mtype]['lengths'].append(branch_length)
            self.branch_lengths[mtype]['probs'].append(prob)
            check_sum += self.branch_lengths[mtype]['probs'][-1]
        if not utils.is_normed(check_sum):
            raise Exception('not normalized %f' % check_sum)
    if self.args.debug:
        print '  mean branch lengths'
        for mtype in ['all',] + utils.regions:
            print '    %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
开发者ID:antibodyome,项目名称:partis,代码行数:27,代码来源:treegenerator.py
示例4: write_mute_freqs
def write_mute_freqs(self, region, gene_or_insert_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_or_insert_name>, renormalize, then write to a file for bppseqgen. """
    mute_freqs = self.get_mute_freqs(gene_or_insert_name)
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    # offset between a position in <seq> and the corresponding germline position in <mute_freqs>
    left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
    # one relative rate per position in <seq>: the per-position freq where we have one, otherwise the overall mean
    rates = [mute_freqs.get(ipos + left_erosion_length, mute_freqs['overall_mean']) for ipos in range(len(seq))]
    total = sum(rates)
    assert total != 0.0  # I am not hip enough to divide by zero
    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    scale = float(len(seq)) / total
    rates = [scale * r for r in rates]
    # and... double check it, just for shits and giggles
    assert utils.is_normed(sum(rates) / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?
    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
开发者ID:Annak17,项目名称:partis,代码行数:35,代码来源:recombinator.py
示例5: read_mute_freqs
def read_mute_freqs(self, parameter_dir):
# NOTE these are mute freqs, not branch lengths, but it's ok for now
branch_lengths = {}
for mtype in ['all',] + utils.regions:
branch_lengths[mtype] = {n : [] for n in ('lengths', 'probs')}
mutehist = self.get_mute_hist(mtype, parameter_dir)
branch_lengths[mtype]['mean'] = mutehist.get_mean()
mutehist.normalize(include_overflows=False, expect_overflows=True) # if it was written with overflows included, it'll need to be renormalized
check_sum = 0.0
for ibin in range(1, mutehist.n_bins + 1): # ignore under/overflow bins
freq = mutehist.get_bin_centers()[ibin]
branch_length = self.convert_observed_changes_to_branch_length(float(freq))
prob = mutehist.bin_contents[ibin]
branch_lengths[mtype]['lengths'].append(branch_length)
branch_lengths[mtype]['probs'].append(prob)
check_sum += branch_lengths[mtype]['probs'][-1]
if not utils.is_normed(check_sum):
raise Exception('not normalized %f' % check_sum)
if self.args.debug:
print ' mean branch lengths'
for mtype in ['all',] + utils.regions:
print ' %4s %7.3f (ratio %7.3f)' % (mtype, branch_lengths[mtype]['mean'], branch_lengths[mtype]['mean'] / branch_lengths['all']['mean'])
return branch_lengths
开发者ID:Annak17,项目名称:partis,代码行数:26,代码来源:treegenerator.py
示例6: read_vdj_version_freqs
def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data, normalized over the combos whose genes are all in self.glfo. Returns None if rearranging from scratch. """
    if self.args.rearrange_from_scratch:
        return None
    fname = self.parameter_dir + '/' + utils.get_parameter_fname('all')  # bugfix: compute once up front -- the exception below used <fname> without defining it (NameError)
    version_freq_table = {}
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:  # skip combos that use genes missing from our germline set
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])
    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)
    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
    return version_freq_table
开发者ID:psathyrella,项目名称:partis,代码行数:33,代码来源:recombinator.py
示例7: read_erosion_info
def read_erosion_info(self, this_gene, approved_genes=None):
    """Read erosion-length counts for this region (for <this_gene>, or for all of <approved_genes>), smooth/interpolate them, and return normalized probs.

    Returns <eprobs> such that eprobs[erosion][n_eroded] is the probability of eroding <n_eroded> bases for <erosion>.
    """
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]
    eprobs = {}
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:  # only this region's erosions (erosion names start with the region letter)
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.chain]:  # dummy d gene -- presumably nothing to erode (TODO confirm against glutils)
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue
                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])
                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])
        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))
        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print '  interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])
        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)
    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print '    used erosion info from:', ' '.join(genes_used)
    return eprobs
开发者ID:psathyrella,项目名称:partis,代码行数:60,代码来源:hmmwriter.py
示例8: read_insertion_content
def read_insertion_content(self, insertion):
icontentprobs = {} # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
if insertion in utils.boundaries: # i.e. if it's a real insertion
with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
reader = csv.DictReader(icfile)
total = 0
for line in reader:
icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
total += int(line['count'])
if total == 0. and self.debug:
print '\n WARNING zero insertion content probs read from %s, so setting to uniform distribution' % self.indir + '/' + insertion + '_insertion_content.csv'
for nuke in utils.nukes:
if total == 0.:
icontentprobs[nuke] = 1. / len(utils.nukes)
else:
if nuke not in icontentprobs:
print ' %s not in insertion content probs, adding with zero' % nuke
icontentprobs[nuke] = 0
icontentprobs[nuke] /= float(total)
else: # just return uniform probs for effective (fv and jf) insertions
icontentprobs = {n : 0.25 for n in utils.nukes}
assert utils.is_normed(icontentprobs)
return icontentprobs
开发者ID:psathyrella,项目名称:partis,代码行数:26,代码来源:hmmwriter.py
示例9: read_insertion_content
def read_insertion_content(self, insertion):
    """Read nucleotide-content probs for <insertion> from disk into self.insertion_content_probs (uniform for the effective fv/jf insertions)."""
    self.insertion_content_probs[insertion] = {}
    if insertion in utils.boundaries:  # i.e. a real insertion: read its content counts from the parameter dir (previous comment wrongly described this branch as the uniform fv/jf case -- that's the else below)
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            if total == 0.:  # no counts read: fall back to uniform below
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % self.indir + '/' + insertion + '_insertion_content.csv'
            for nuke in utils.nukes:
                if total == 0.:
                    self.insertion_content_probs[insertion][nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
    else:  # effective (fv/jf) insertions get uniform content probs
        self.insertion_content_probs[insertion] = {n : 0.25 for n in utils.nukes}
    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print '  insertion content for', insertion, self.insertion_content_probs[insertion]
示例10: read_vdj_version_freqs
def read_vdj_version_freqs(self, fname):
    """ Read the frequencies at which various VDJ combinations appeared in data, normalized over the combos that survive the --only-genes restriction, into self.version_freq_table. """
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            if self.args.only_genes is not None:  # are we restricting ourselves to a subset of genes?
                if line['v_gene'] not in self.args.only_genes:
                    continue
                if line['d_gene'] not in self.args.only_genes:
                    continue
                if line['j_gene'] not in self.args.only_genes:
                    continue
            total += float(line['count'])
            index = tuple(line[column] for column in utils.index_columns)
            assert index not in self.version_freq_table
            self.version_freq_table[index] = float(line['count'])
    if len(self.version_freq_table) == 0:
        # raise instead of print + 'assert False' (asserts are stripped under -O, and siblings raise here)
        raise Exception('didn\'t find any matching gene combinations in %s' % fname)
    # then normalize
    test_total = 0.0
    for index in self.version_freq_table:
        self.version_freq_table[index] /= total
        test_total += self.version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(self.version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
示例11: read_insertion_info
def read_insertion_info(self, this_gene, approved_genes=None):
if approved_genes == None: # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
approved_genes = [this_gene,]
genes_used = set()
for insertion in self.insertions:
self.insertion_probs[insertion] = {}
deps = utils.column_dependencies[insertion + '_insertion']
with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
reader = csv.DictReader(infile)
for line in reader:
# first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes: # NOTE you'll need to change this if you want it to depend on another region's genes
continue
# then add in this insertion's counts
n_inserted = 0
n_inserted = int(line[insertion + '_insertion'])
if n_inserted not in self.insertion_probs[insertion]:
self.insertion_probs[insertion][n_inserted] = 0.0
self.insertion_probs[insertion][n_inserted] += float(line['count'])
if self.region + '_gene' in line:
genes_used.add(line[self.region + '_gene'])
assert len(self.insertion_probs[insertion]) > 0
# print ' interpolate insertions'
interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps) #, max_bin=len(self.germline_seq)) # NOTE that we normalize *after* this
if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2: # all hell breaks loose lower down if we haven't got shit in the way of information
if self.args.debug:
print ' WARNING adding pseudocount to 1-bin in insertion probs'
self.insertion_probs[insertion][0] = 1
self.insertion_probs[insertion][1] = 1
if self.args.debug:
print ' ', self.insertion_probs[insertion]
assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2 # all hell breaks loose lower down if we haven't got shit in the way of information
# and finally, normalize
total = 0.0
for _, val in self.insertion_probs[insertion].iteritems():
total += val
test_total = 0.0
for n_inserted in self.insertion_probs[insertion]:
self.insertion_probs[insertion][n_inserted] /= total
test_total += self.insertion_probs[insertion][n_inserted]
assert utils.is_normed(test_total)
if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
assert False
# self.insertion_content_probs = {}
self.read_insertion_content(insertion) # also read the base content of the insertions
if len(genes_used) > 1: # if length is 1, we will have just used the actual gene
if self.args.debug:
print ' insertions used:', ' '.join(genes_used)
开发者ID:Annak17,项目名称:partis,代码行数:60,代码来源:hmmwriter.py
示例12: check
def check(self):
    """Sanity-check this state: transition probs (and emission probs, when present) must be non-negative and sum to one."""
    assert all(prob >= 0.0 for prob in self.transitions.itervalues())
    if not utils.is_normed(sum(self.transitions.itervalues())):
        raise Exception('transition probs not normed in %s: %s' % (self.name, self.transitions))
    if self.name == 'init':  # no emissions for 'init' state
        return
    if self.emissions is not None:
        assert all(prob >= 0.0 for prob in self.emissions['probs'].itervalues())
        assert utils.is_normed(sum(self.emissions['probs'].itervalues()))
示例13: add_region_entry_transitions
def add_region_entry_transitions(self, state, insertion):
    """
    Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
    For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
    For d and j, this is (mostly) the prob to actually erode on the left side.
    The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
    """
    assert 'jf' not in insertion  # need these to only be *left*-hand insertions
    assert state.name == 'init' or 'insert' in state.name
    # first add transitions to the insert state
    region_entry_prob = 0.0  # Prob to go to an internal germline state (i.e. not to an insert state)
    if state.name == 'init':
        if insertion == '':
            region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
        else:
            region_entry_prob = self.get_zero_length_insertion_prob(insertion)  # prob of entering the region from 'init' is the prob of a zero-length insertion
    elif 'insert' in state.name:
        region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
    else:
        assert False  # unreachable given the assertion on state.name above
    # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
    # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
    # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
    if insertion != '':
        if not insertion in utils.boundaries:  # effective insertions get a single ambiguous 'N' insert state
            nukelist = ['N', ]
        else:
            nukelist = utils.nukes
        for nuke in nukelist:
            content_prob = 1. if nuke == 'N' else self.insertion_content_probs[insertion][nuke]
            state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * content_prob)
    # then add transitions to the region's internal states
    total = 0.0
    if self.region == 'v':  # only add a transition to the zeroth internal state
        state.add_transition('%s_%d' % (self.saniname, 0), region_entry_prob)
        total += region_entry_prob
        self.smallest_entry_index = 0
    else:
        erosion = self.region + '_5p'
        for inuke in range(len(self.germline_seq)):
            erosion_length = inuke
            if erosion_length in self.erosion_probs[erosion]:
                prob = self.erosion_probs[erosion][erosion_length]
                total += prob * region_entry_prob
                if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                    state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                    if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # tells us where we need to start adding internal states (the smallest internal state index we add is the first one that has nonzero transition probability here)
                        self.smallest_entry_index = inuke
                else:
                    assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state
    if region_entry_prob != 0.0 and not utils.is_normed(total / region_entry_prob):
        raise Exception('normalization problem in add_region_entry_transitions():\n  region_entry_prob: %f   total / region_entry_prob: %f' % (region_entry_prob, total / region_entry_prob))
开发者ID:Annak17,项目名称:partis,代码行数:56,代码来源:hmmwriter.py
示例14: choose_allele_prevalence_freqs
def choose_allele_prevalence_freqs(glfo, allele_prevalence_freqs, region, min_allele_prevalence_freq, debug=False):
n_alleles = len(glfo["seqs"][region])
prevalence_counts = numpy.random.randint(
1, int(1.0 / min_allele_prevalence_freq), size=n_alleles
) # ensures that each pair of alleles has a prevalence ratio between <min_allele_prevalence_freq> and 1. NOTE it's inclusive
prevalence_freqs = [float(c) / sum(prevalence_counts) for c in prevalence_counts]
allele_prevalence_freqs[region] = {g: f for g, f in zip(glfo["seqs"][region].keys(), prevalence_freqs)}
assert utils.is_normed(allele_prevalence_freqs[region])
if debug:
print " counts %s" % " ".join([("%5d" % c) for c in prevalence_counts])
print " freqs %s" % " ".join([("%5.3f" % c) for c in prevalence_freqs])
print " min ratio %.3f" % (min(prevalence_freqs) / max(prevalence_freqs))
开发者ID:psathyrella,项目名称:partis,代码行数:12,代码来源:glutils.py
示例15: read_allele_prevalence_freqs
def read_allele_prevalence_freqs(fname, debug=False):
# NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
allele_prevalence_freqs = {r: {} for r in utils.regions}
with open(fname) as pfile:
reader = csv.DictReader(pfile)
for line in reader:
allele_prevalence_freqs[utils.get_region(line["gene"])][line["gene"]] = float(line["freq"])
for region in utils.regions:
if len(allele_prevalence_freqs[region]) == 0:
continue
if debug:
for gene, freq in allele_prevalence_freqs[region].items():
print "%14.8f %s" % (freq, utils.color_gene(gene))
assert utils.is_normed(allele_prevalence_freqs[region])
return allele_prevalence_freqs
开发者ID:psathyrella,项目名称:partis,代码行数:15,代码来源:glutils.py
示例16: check
def check(self):
    """Sanity-check this state: transition, emission, and pair-emission probs must each be non-negative and sum to one."""
    assert all(prob >= 0.0 for prob in self.transitions.itervalues())
    assert utils.is_normed(sum(self.transitions.itervalues()))
    if self.name == 'init':  # no emissions for 'init' state
        return
    if self.emissions is not None:
        assert all(prob >= 0.0 for prob in self.emissions['probs'].itervalues())
        assert utils.is_normed(sum(self.emissions['probs'].itervalues()))
    if self.pair_emissions is not None:
        pair_total = 0.0  # pair emissions are normed over the whole letter1 x letter2 table
        for letter1 in self.pair_emissions['probs']:
            assert all(prob >= 0.0 for prob in self.pair_emissions['probs'][letter1].itervalues())
            pair_total += sum(self.pair_emissions['probs'][letter1].itervalues())
        assert utils.is_normed(pair_total)
示例17: add_region_entry_transitions
def add_region_entry_transitions(self, state, insertion):
    """
    Add transitions *into* the v, d, or j regions. Called from either the 'init' state or the 'insert_left' state.
    For v, this is (mostly) the prob that the read doesn't extend all the way to the left side of the v gene.
    For d and j, this is (mostly) the prob to actually erode on the left side.
    The two <mostly>s are there because in both cases, we're starting from *approximate* smith-waterman alignments, so we need to add some fuzz in case the s-w is off.
    """
    assert 'jf' not in insertion  # need these to only be *left*-hand insertions
    assert state.name == 'init' or 'insert' in state.name
    region_entry_prob = 0.0  # Prob to go directly into the region (i.e. with no insertion)
    # The sum of the region entry probs must be (1 - non_zero_insertion_prob) for d and j
    # (i.e. such that [prob of transitions to insert] + [prob of transitions *not* to insert] is 1.0)
    # first add transitions to the insert state
    if state.name == 'init':
        if insertion == '':
            region_entry_prob = 1.0  # if no insert state on this side (i.e. we're on left side of v), we have no choice but to enter the region (the internal states)
        else:
            region_entry_prob = self.insertion_probs[insertion][0]  # prob of entering the region from 'init' is the prob of a zero-length insertion
    elif 'insert' in state.name:
        region_entry_prob = 1.0 - self.get_insert_self_transition_prob(insertion)  # the 'insert_left' state has to either go to itself, or else enter the region
    else:
        assert False  # unreachable given the assertion on state.name above
    # If this is an 'init' state, we add a transition to 'insert' with probability the observed probability of a non-zero insertion
    # Whereas if this is an 'insert' state, we add a *self*-transition with probability 1/<mean observed insert length>
    # update: now, we also multiply by the insertion content prob, since we now have four insert states (and can thus no longer use this prob in the emissions)
    if insertion != '':
        for nuke in utils.nukes:
            state.add_transition('insert_left_' + nuke, (1.0 - region_entry_prob) * self.insertion_content_probs[insertion][nuke])
    # then add transitions to the region's internal states
    erosion = self.region + '_5p'
    total = 0.0
    for inuke in range(len(self.germline_seq)):
        erosion_length = inuke  # entering at internal state <inuke> means <inuke> bases were eroded on the left
        if erosion_length in self.erosion_probs[erosion]:
            prob = self.erosion_probs[erosion][erosion_length]
            total += prob * region_entry_prob
            if region_entry_prob != 0.0:  # only add the line if there's a chance of entering the region from this state
                state.add_transition('%s_%d' % (self.saniname, inuke), prob * region_entry_prob)
                if self.smallest_entry_index == -1 or inuke < self.smallest_entry_index:  # track the smallest internal state index that gets a nonzero entry transition
                    self.smallest_entry_index = inuke
            else:
                assert state.name == 'init'  # if there's *no* chance of entering the region, this better *not* be the 'insert_left' state
    assert region_entry_prob == 0.0 or utils.is_normed(total / region_entry_prob)
示例18: read_insertion_content
def read_insertion_content(self):
self.insertion_content_probs = {}
for bound in utils.boundaries:
self.insertion_content_probs[bound] = {}
with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
reader = csv.DictReader(icfile)
total = 0
for line in reader:
self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
total += int(line['count'])
for nuke in utils.nukes:
if nuke not in self.insertion_content_probs[bound]:
print ' %s not in insertion content probs, adding with zero' % nuke
self.insertion_content_probs[bound][nuke] = 0
self.insertion_content_probs[bound][nuke] /= float(total)
assert utils.is_normed(self.insertion_content_probs[bound])
开发者ID:antibodyome,项目名称:partis,代码行数:17,代码来源:recombinator.py
示例19: normalize
def normalize(self):
sum_value = 0.0
for ib in range(1, self.n_bins + 1): # don't include under/overflows in sum_value
sum_value += self.bin_contents[ib]
if sum_value == 0.0:
print 'WARNING sum zero in Hist::normalize, returning without doing anything'
return
# make sure there's not too much stuff in the under/overflows
if self.bin_contents[0]/sum_value > 1e-10 or self.bin_contents[self.n_bins+1]/sum_value > 1e-10:
print 'WARNING under/overflows'
for ib in range(1, self.n_bins + 1):
self.bin_contents[ib] /= sum_value
if self.sum_weights_squared is not None:
self.sum_weights_squared[ib] /= sum_value*sum_value
check_sum = 0.0
for ib in range(1, self.n_bins + 1): # check it
check_sum += self.bin_contents[ib]
assert is_normed(check_sum, this_eps=1e-10)
开发者ID:matsengrp,项目名称:bioboxpartis,代码行数:18,代码来源:hist.py
请发表评论