本文整理汇总了Python中nesoni.io.read_sequences函数的典型用法代码示例。如果您正苦于以下问题:Python read_sequences函数的具体用法?Python read_sequences怎么用?Python read_sequences使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了read_sequences函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: build_snpeff
def build_snpeff(self):
jar = io.find_jar('snpEff.jar')
with open(self/'snpeff.config','wb') as f:
print >> f, 'data_dir = snpeff'
print >> f, 'genomes : ' + self.name
print >> f, self.name + '.genome : ' + self.name
snpwork = io.Workspace(self/'snpeff',must_exist=False)
snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
annotations = self.annotations_filename()
assert annotations
with open(snpwork_genome/'genes.gff','wb') as f:
for record in annotation.read_annotations(annotations):
if record.end <= record.start: continue
if not record.attr:
record.attr['attributes'] = 'none'
print >> f, record.as_gff()
with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
for name, seq in io.read_sequences(self.reference_fasta_filename()):
io.write_fasta(f, name, seq)
io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
开发者ID:Victorian-Bioinformatics-Consortium,项目名称:nesoni,代码行数:27,代码来源:reference_directory.py
示例2: run
def run(self):
extractions = [ ]
for item in self.genes.split(','):
extraction = item.split('/')
assert len(extraction) == 4
extractions.append(extraction)
rename = { }
if self.rename:
for item in self.rename.split(','):
old,new = item.split('=')
rename[old] = new
work = self.get_workspace()
with workspace.tempspace() as temp:
items = list(annotation.read_annotations(self.annotation))
for item in items:
item.seqid = rename.get(item.seqid, item.seqid)
annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
del items
with open(temp/'temp.fa','wb') as f:
for name,seq in io.read_sequences(self.genome):
name = name.split()[0]
name = rename.get(name,name)
io.write_fasta(f, name, seq)
reference_directory.Make_tt_reference(
self.output_dir,
filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
index = self.index, shrimp = self.shrimp,
bowtie = self.bowtie, star = self.star
).run()
开发者ID:Victorian-Bioinformatics-Consortium,项目名称:tail-tools,代码行数:34,代码来源:reference_directory_ensembl.py
示例3: make_file_for_primer_3
def make_file_for_primer_3 (gff_file, ref_file, names_file, output_file):
# check for a tmp direcory
if len(glob.glob("./tmp")) == 0:
call (["mkdir", "tmp"])
gff_file = list(annotation.read_annotations(gff_file))
print "\n Reading in the reference file"
seq_dict = dict(io.read_sequences(ref_file))
names_file = open(names_file).readlines()
config = open("primer_config.txt").readlines()
with open("tmp/regions_" + output_file, 'w') as out_f:
for name in names_file:
sname = name.strip("\n")
found = False
for line in gff_file:
gff_name = line.attr.get ("Name", "No_name")
peak = line.attr.get ("id", "No_id")
if sname in gff_name.split("/"):
out_f.write ("SEQUENCE_ID="+ gff_name.replace("/", "_") + "_" + peak + "\n")
out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
found = True
for cline in config:
out_f.write(cline.strip("\n") + "\n")
out_f.write("=" + "\n")
if found ==False:
print "Could not find the gene " + sname + " in the gff file"
开发者ID:AndrewPattison,项目名称:Python-scripts,代码行数:27,代码来源:get_primers.py
示例4: main
def main(args):
size, args = grace.get_option_value(args,'--size',int,200)
stride, args = grace.get_option_value(args,'--stride',int,50)
grace.expect_no_further_options(args)
if not args:
print USAGE
return 1
for filename in args:
for name, seq in io.read_sequences(filename):
name_parts = name.split(None, 1)
name = name_parts[0]
if len(name_parts) > 1:
desc = ' ' + name_parts[1]
else:
desc = ''
for i in xrange(-size+stride,len(seq),stride):
start = max(0,min(len(seq),i))
end = max(0,min(len(seq), i+size))
io.write_fasta(
sys.stdout,
'%s:%d..%d' % (name,start+1,end) + desc,
seq[start:end]
)
return 0
开发者ID:Slugger70,项目名称:nesoni,代码行数:28,代码来源:shred.py
示例5: run
def run(self):
f = self.begin_output()
for filename in self.filenames:
any = False
name = os.path.splitext(os.path.split(filename)[1])[0]
try:
iterator = io.read_sequences(filename, qualities=True)
except grace.Error:
iterator = None
if iterator is not None:
total = 0
total_length = 0
for seq in io.read_sequences(filename, qualities=True):
total += 1
total_length += len(seq[1])
print >> f, grace.datum(name, 'sequences', total)
print >> f, grace.datum(name, 'average length', float(total_length)/total)
print >> f
any = True
try:
iterator = annotation.read_annotations(filename)
except grace.Error:
iterator = None
if iterator:
total = 0
counts = { }
for item in iterator:
total += 1
counts[item.type] = counts.get(item.type,0)+1
print >> f, grace.datum(name, 'features', total)
for key in sorted(counts):
print >> f, grace.datum(name, key + ' features', counts[key])
print >> f
any = True
if not any:
raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')
self.end_output(f)
开发者ID:Slugger70,项目名称:nesoni,代码行数:46,代码来源:trivia.py
示例6: run
def run(self):
base = os.path.split(self.prefix)[1]
annotations = [ ]
sequences = [ ]
for filename in self.filenames:
any = False
if io.is_sequence_file(filename):
sequences.append(filename)
any = True
if annotation.is_annotation_file(filename):
annotations.append(filename)
any = True
assert any, 'File is neither a recognized sequence or annotation file'
cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
property_filename = os.path.join(self.prefix,'property.txt')
gff_filename = os.path.join(self.prefix,base+'.gff')
output_filenames = [ cytoband_filename, property_filename, gff_filename ]
if not os.path.exists(self.prefix):
os.mkdir(self.prefix)
f = open(property_filename,'wb')
print >> f, 'ordered=true'
print >> f, 'id=%s' % base
print >> f, 'name=%s' % (self.name or base)
print >> f, 'cytobandFile=%s_cytoband.txt' % base
print >> f, 'geneFile=%s.gff' % base
print >> f, 'sequenceLocation=%s' % base
f.close()
trivia.As_gff(output=gff_filename,
filenames=annotations,
exclude=[ 'gene', 'source' ]
).run()
f_cyt = open(cytoband_filename,'wb')
for filename in sequences:
for name, seq in io.read_sequences(filename):
assert '/' not in name
f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
f.write(seq)
f.close()
print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
f_cyt.close()
genome_filename = self.prefix + '.genome'
if os.path.exists(genome_filename):
os.unlink(genome_filename)
io.execute(
['zip', '-j', io.abspath(genome_filename)] +
[ io.abspath(item) for item in output_filenames ]
)
for filename in output_filenames:
if os.path.exists(filename):
os.unlink(filename)
开发者ID:Slugger70,项目名称:nesoni,代码行数:58,代码来源:igv.py
示例7: get_lengths
def get_lengths(self):
#Legacy working directory
if not self.object_exists('reference-lengths.pickle.gz'):
lengths = [ ]
for name, seq in io.read_sequences(self.reference_fasta_filename()):
name = name.split()[0]
lengths.append( (name, len(seq)) )
self.set_object(lengths, 'reference-lengths.pickle.gz')
return self.get_object('reference-lengths.pickle.gz')
开发者ID:Slugger70,项目名称:nesoni,代码行数:10,代码来源:reference_directory.py
示例8: convert
def convert(filename):
info = io.get_file_info(filename)
ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
if ok:
return filename
result_name = tempname()
with open(result_name,'wb') as f:
for name, seq, qual in io.read_sequences(filename, qualities='required'):
io.write_fastq(f, name, seq, qual)
return result_name
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:10,代码来源:bowtie.py
示例9: run
def run(self):
f = self.begin_output()
for filename in self.filenames:
info = io.get_file_info(filename)
any = False
name = os.path.splitext(os.path.split(filename)[1])[0]
if info.matches('sequences'):
total = 0
total_length = 0
for seq in io.read_sequences(filename, qualities=True):
total += 1
total_length += len(seq[1])
print >> f, grace.datum(name, 'sequences', total)
print >> f, grace.datum(name, 'total bases', total_length)
if total:
print >> f, grace.datum(name, 'average length', float(total_length)/total)
print >> f
any = True
if info.matches('annotations'):
total = 0
counts = { }
for item in annotation.read_annotations(filename):
total += 1
counts[item.type] = counts.get(item.type,0)+1
print >> f, grace.datum(name, 'features', total)
for key in sorted(counts):
print >> f, grace.datum(name, key + ' features', counts[key])
print >> f
any = True
if info.matches('type-vcf'):
reader_f = io.open_possibly_compressed_file(filename)
reader = vcf.Reader(reader_f)
n = 0
for item in reader:
n += 1
print >> f, grace.datum(name, 'variants', n)
any = True
if not any:
raise grace.Error('Don\'t know what to do with ' + filename)
self.end_output(f)
开发者ID:drpowell,项目名称:nesoni,代码行数:49,代码来源:trivia.py
示例10: iter_reads
def iter_reads(config, qualities=False):
if 'stride' not in config:
raise grace.Error('Please re-run nesoni shrimp, output format has changed')
stride = config['stride']
for reads_filename_set in config['reads']:
if config['solid']:
reader = [ io.read_solid(filename) for filename in reads_filename_set ]
else:
reader = [ io.read_sequences(filename, qualities) for filename in reads_filename_set ]
reader = itertools.izip(*reader)
for i, items in enumerate(reader):
if i % stride == 0:
for item in items:
yield item
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:16,代码来源:shrimp.py
示例11: run
def run(self):
workspace = self.get_workspace()
reference = reference_directory.Reference(self.reference, must_exist=True)
reader_f = io.open_possibly_compressed_file(self.vcf)
reader = vcf.Reader(reader_f)
variants = collections.defaultdict(list)
for record in reader:
variants[record.CHROM].append(record)
reader_f.close()
for chrom in variants:
variants[chrom].sort(key=lambda item: item.POS)
filenames = [ workspace/(item+'.fa') for item in reader.samples ]
for filename in filenames:
with open(filename,'wb'): pass
for name, seq in io.read_sequences(reference.reference_fasta_filename()):
for i, sample in enumerate(reader.samples):
revised = [ ]
pos = 0
for variant in variants[name]:
gt = variant.samples[i].data.GT
if gt is None: continue
assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): '+gt
gt_number = int(gt)
if gt_number == 0:
var_seq = variant.REF
else:
var_seq = str(variant.ALT[gt_number-1])
assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: '+var_seq
new_pos = variant.POS-1
assert new_pos >= pos, 'Variants overlap.'
revised.append(seq[pos:new_pos])
pos = new_pos
revised.append(var_seq)
assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
pos += len(variant.REF)
revised.append(seq[pos:])
with open(filenames[i],'ab') as f:
io.write_fasta(f, name, ''.join(revised))
del variants[name]
assert not variants, 'Chromosome names in VCF not in reference: '+' '.join(variants)
开发者ID:simonalpha,项目名称:nesoni,代码行数:47,代码来源:variant.py
示例12: run
def run(self):
f = self.begin_output()
for filename in self.filenames:
for name, seq in io.read_sequences(filename):
name_parts = name.split(None, 1)
name = name_parts[0]
for i in xrange(-self.size+self.stride,len(seq),self.stride):
start = max(0,min(len(seq),i))
end = max(0,min(len(seq), i+self.size))
shred_name = '%s:%d..%d' % (name,start+1,end)
shred_seq = seq
if self.quality:
io.write_fastq(f, shred_name, seq[start:end], chr(33+self.quality)*(end-start))
else:
io.write_fasta(f, shred_name, seq[start:end])
self.end_output(f)
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:18,代码来源:shred.py
示例13: debias
def debias(args):
import numpy
radius, args = grace.get_option_value(args, '--radius', int, 2)
dirs = args
for dir_name in dirs:
for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')):
for suffix, ambig_suffix in [
('-depth', '-ambiguous-depth'),
('-pairspan-depth', '-ambiguous-pairspan-depth'),
]:
root = grace.filesystem_friendly_name(name)
full_name = os.path.join(dir_name, root + suffix + '.userplot')
full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot')
if not os.path.exists(full_name): continue
if not os.path.exists(full_ambig_name): continue
output_suffix = '-%d.userplot' % radius
print dir_name, root, output_suffix
depths = numpy.array( read_unstranded_userplot(full_name) )
ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name) )
expect = expected_depth(root, seq, depths, ambig_depths, radius)
write_unstranded_userplot(
os.path.join(dir_name, root + suffix + '-expected' + output_suffix),
expect)
corrected = depths / expect * numpy.median(expect)
corrected[expect <= 5.0] = 0.0
write_unstranded_userplot(
os.path.join(dir_name, root + suffix + '-corrected' + output_suffix),
corrected)
ambig_corrected = ambig_depths / expect * numpy.median(expect)
ambig_corrected[expect <= 0.0] = 0.0
write_unstranded_userplot(
os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix),
ambig_corrected)
开发者ID:drpowell,项目名称:nesoni,代码行数:42,代码来源:trivia.py
示例14: set_sequences
def set_sequences(self, filenames):
reference_genbank_filename = self / 'reference.gbk'
reference_filename = self / 'reference.fa'
reference_genbank_file = open(reference_genbank_filename,'wb')
any_genbank = [ False ]
def genbank_callback(name, record):
""" Make a copy of any genbank files passed in. """
from Bio import SeqIO
SeqIO.write([record], reference_genbank_file, 'genbank')
f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
SeqIO.write([record], f, 'genbank')
f.close()
any_genbank[0] = True
lengths = [ ]
seen = set()
f = open(reference_filename, 'wb')
for filename in filenames:
for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
name = name.split()[0]
assert name not in seen, 'Duplicate chromosome name: ' + name
seen.add(name)
lengths.append( (name, len(seq)) )
io.write_fasta(f, name, seq)
f.close()
self.set_object(lengths, 'reference-lengths.pickle.gz')
reference_genbank_file.close()
if not any_genbank[0]:
os.unlink(reference_genbank_filename)
# Create an index of the reference sequences for samtools
io.execute([
'samtools', 'faidx', reference_filename
])
开发者ID:Victorian-Bioinformatics-Consortium,项目名称:nesoni,代码行数:40,代码来源:reference_directory.py
示例15: run
def run(self):
#mincov, args = grace.get_option_value(args, '--mincov', int, 1)
#maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
#minsize, args = grace.get_option_value(args, '--minsize', int, 200)
#what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
#is_core = (what == 'core')
#
#grace.expect_no_further_options(args)
#
#if len(args) < 2:
# print >> sys.stderr, HELP
# raise grace.Help_shown()
#
#output_dir, working_dirs = args[0], args[1:]
#
##assert not path.exists(path.join(output_dir, 'reference.fa')), \
#assert not path.exists(path.join(output_dir, 'parameters')), \
# 'Output directory not given'
#
#if not path.exists(output_dir):
# os.mkdir(output_dir)
assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
is_core = (self.what == 'core')
workspace = self.get_workspace()
for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
self.log.log(name + '\n')
friendly_name = grace.filesystem_friendly_name(name)
good = [ True ] * len(seq)
for working_dir in self.working_dirs:
if is_core:
suffix = '-depth.userplot'
else:
suffix = '-ambiguous-depth.userplot'
data = trivia.read_unstranded_userplot(
os.path.join(working_dir, friendly_name+suffix)
)
assert len(seq) == len(data)
for i in xrange(len(seq)):
if good[i]:
if is_core:
good[i] = data[i] >= self.mincov
else:
good[i] = data[i] < self.mincov
#Close holes
start = -self.maxdiff-1
n_holes = 0
for i in xrange(len(seq)):
if good[i]:
if 0 < i-start <= self.maxdiff:
for j in xrange(start,i): good[j] = True
n_holes += 1
start = i+1
self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')
f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
io.write_fasta(f, name,
''.join([ (seq[i] if good[i] else 'N')
for i in xrange(len(seq)) ])
)
f.close()
f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
io.write_fasta(f, name,
''.join([ (seq[i] if good[i] else seq[i].lower())
for i in xrange(len(seq)) ])
)
f.close()
f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
start = 0
n_good = [0]
n_good_bases = [0]
def emit(i):
if i-start < self.minsize: return
if good[start]:
n_good[0] += 1
n_good_bases[0] += i-start
io.write_fasta(
f_good if good[start] else f_nongood,
'%s:%d..%d' % (name, start+1,i),
seq[start:i]
)
for i in xrange(1,len(seq)):
if good[i] != good[start]:
emit(i)
start = i
emit(len(seq))
f_nongood.close()
f_good.close()
self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
#.........这里部分代码省略.........
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:101,代码来源:core.py
示例16: recombination
def recombination(args):
grace.expect_no_further_options(args)
if len(args) != 2:
print >> sys.stderr, USAGE
raise grace.Help_shown()
working_dir, seq_name = args
references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa')))
depth = { }
prefixes = { }
suffixes = { }
for name in references:
depth[name] = numpy.zeros(len(references[name]), 'int64')
prefixes[name] = [ [] for base in references[name] ]
suffixes[name] = [ [] for base in references[name] ]
def register_divergence(hit):
if not hit.query_forward:
hit = hit.reversed()
margin = 20
if hit.target_end - hit.target_start < 20:
return False
depth[hit.target_name][hit.target_start : hit.target_end] += 1
any = False
if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq):
suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] )
any = True
if hit.query_start >= margin: # and hit.target_start > 0:
prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] )
any = True
return any
n = 0
for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir):
# Skip reads containing Ns
if 'N' in read_seq: continue
for line in hits:
register_divergence(alignment_from_shrimp(line, references, read_name, read_seq))
n += 1
#if n > 100000:
# break
if n%10000 == 0:
grace.status('Processing read %s' % grace.pretty_number(n))
grace.status('')
def show_items(items):
original_length = len(items)
cut = 0
while len(items) > 80:
cut += 1
items = [ item for item in items if item[0] >= cut ]
for item in items:
print item[1]
if len(items) < original_length:
print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1)
def score(items):
if not items: return 1.0
return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2)
def summarize_prefixes(seqs, pad):
seqs = sorted(seqs, key=lambda seq: seq[::-1])
cut = 100
while True:
items = [ ]
for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]):
ss = list(iterator)
anylong = any( item != seq for item in ss )
n = len(ss)
items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) )
if score(items) >= 1.0/20: break
cut -= 1
show_items(items)
def summarize_suffixes(seqs, pad):
seqs = sorted(seqs)
cut = 100
while True:
items = [ ]
for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]):
ss = list(iterator)
anylong = any( item != seq for item in ss )
n = len(ss)
#.........这里部分代码省略.........
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:101,代码来源:recombination.py
示例17: fill_scaffolds
def fill_scaffolds(args):
max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
if len(args) < 2:
print USAGE
return 1
(output_dir, graph_dir), args = args[:2], args[2:]
scaffolds = [ ]
def scaffold(args):
circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
scaffold = [ ]
for item in args:
scaffold.append( ('contig', int(item)) )
scaffold.append( ('gap', None) )
if not circular: scaffold = scaffold[:-1]
name = 'custom_scaffold_%d' % (len(scaffolds)+1)
scaffolds.append( (name, scaffold) )
grace.execute(args, [scaffold])
custom_scaffolds = (len(scaffolds) != 0)
sequences = dict(
(a.split()[0], b.upper())
for a,b in
io.read_sequences(os.path.join(
graph_dir, '454AllContigs.fna')))
sequence_names = sorted(sequences)
sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
contexts = { }
context_names = { }
context_depths = { }
for i in xrange(1,len(sequence_names)+1):
seq = sequences[sequence_names[i-1]]
contexts[ i ] = seq
context_names[ i ] = sequence_names[i-1]+'-fwd'
contexts[ -i ] = bio.reverse_complement(seq)
context_names[ -i ] = sequence_names[i-1]+'-rev'
links = collections.defaultdict(list)
for line in open(
os.path.join(graph_dir, '454ContigGraph.txt'),
'rU'):
parts = line.rstrip('\n').split('\t')
if parts[0].isdigit():
seq = sequence_ids[parts[1]]
context_depths[ seq] = float(parts[3])
context_depths[-seq] = float(parts[3])
if parts[0] == 'C':
name1 = 'contig%05d' % int(parts[1])
dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
name2 = 'contig%05d' % int(parts[3])
dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
depth = int(parts[5])
#print name1, dir1, name2, dir2, depth
links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
if parts[0] == 'S' and not custom_scaffolds:
name = 'scaffold%05d' % int(parts[2])
components = parts[3].split(';')
scaffold = [ ]
for component in components:
a,b = component.split(':')
if a == 'gap':
scaffold.append( ('gap',int(b)) )
else:
strand = { '+': +1, '-': -1 }[ b ]
scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
scaffolds.append( (name, scaffold) )
#paths = { }
#
#todo = [ ]
#for i in contexts:
# for depth_left, neg_left in links[-i]:
# left = -neg_left
# for depth_right, right in links[i]:
# todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
#
#heapq.heapify(todo)
#while todo:
# score, source, dest, path = heapq.heappop(todo)
# if (source,dest) in paths: continue
#
# paths[(source,dest)] = path
#.........这里部分代码省略.........
开发者ID:Puneet-Shivanand,项目名称:nesoni,代码行数:101,代码来源:fill_scaffolds.py
示例18: run
def run(self):
#assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'
# Reference genome
#chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
chromosomes = collections.OrderedDict(io.read_sequences(self.reference))
def get_interpeak_seq(peaks):
start = min(item.transcription_stop for item in peaks)
end = max(item.transcription_stop for item in peaks)
if end-start > self.max_seq: return ''
if peaks[0].strand >= 0:
return chromosomes[peaks[0].seqid][start:end]
else:
return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])
def get_prepeak_seq(gene,peaks):
if gene.strand >= 0:
start = gene.utr_pos
end = min(item.transcription_stop for item in peaks)
if end-start > self.max_seq: return ''
return chromosomes[gene.seqid][start:end]
else:
start = max(item.transcription_stop for item in peaks)
end = gene.utr_pos
if end-start > self.max_seq: return ''
return bio.reverse_complement(chromosomes[gene.seqid][start:end])
# Normalization files
if self.norm_file:
norm_file = self.norm_file
else:
nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
norm_file = self.prefix+'-norm.csv'
norms = io.read_grouped_table(norm_file, [('All',str)])['All']
pair_norm_names = [ ]
pair_norms = [ ]
for i in xrange(len(norms)):
pair_norm_names.append(norms.keys()[i]+'-peak1')
pair_norms.append(norms.values()[i])
for i in xrange(len(norms)):
pair_norm_names.append(norms.keys()[i]+'-peak2')
pair_norms.append(norms.values()[i])
io.write_grouped_csv(
self.prefix+'-pairs-norm.csv',
[('All',io.named_list_type(pair_norm_names)(pair_norms))],
comments=['#Normalization'],
)
# Read data
annotations = list(annotation.read_annotations(self.parents))
if self.utrs:
utrs = list(annotation.read_annotations(self.utrs))
else:
utrs = [ ]
children = list(annotation.read_annotations(self.children))
count_table = io.read_grouped_table(self.counts, [
('Count',int),
('Tail_count',int),
('Tail',_float_or_none),
('Proportion',_float_or_none),
('Annotation',str)
])
counts = count_table['Count']
tail_counts = count_table['Tail_count']
proportions = count_table['Proportion']
tails = count_table['Tail']
samples = counts.value_type().keys()
sample_tags = { }
for line in count_table.comments:
if line.startswith('#sampleTags='):
parts = line[len('#sampleTags='):].split(',')
assert parts[0] not in sample_tags
sample_tags[parts[0]] = parts
for item in children:
item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )
parents = [ ]
id_to_parent = { }
for item in annotations:
if item.type != self.parent_type: continue
assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
parents.append(item)
id_to_parent[item.get_id()] = item
item.children = [ ]
#item.cds = [ ]
# Default utr
if item.strand >= 0:
item.utr_pos = item.end
else:
item.utr_pos = item.start
#.........这里部分代码省略.........
开发者ID:stu2,项目名称:tail-tools,代码行数:101,代码来源:alternative_tails.py
示例19: run
def run(self):
references = { }
for filename in self.reference_filenames:
for name, seq in io.read_sequences(filename):
references[name] = seq
tail_lengths = { }
adaptor_bases = { }
for filename in self.clips:
with io.open_possibly_compressed_file(filename) as f:
for line in f:
if line.startswith('#'): continue
parts = line.rstrip('\n').split('\t')
name = parts[0].split()[0]
tail_lengths[name] = int(parts[3])-int(parts[2])
adaptor_bases[name] = int(parts[6])
in_file = self.begin_input()
out_file = self.begin_output()
assert self.prop_a >= 0.0 and self.prop_a <= 1.0
a_score = 1-self.prop_a
non_a_score = -self.prop_a
for line in in_file:
line = line.rstrip()
if line.startswith('@'):
print >> out_file, line
continue
al = Alignment(line)
if al.flag & FLAG_UNMAPPED:
continue
#ref = references[al.rname]
reverse = al.flag & FLAG_REVERSE
if reverse:
read_bases = rev_comp(al.seq)
read_qual = al.qual[::-1]
cigar = cigar_decode(al.cigar)[::-1]
else:
read_bases = al.seq
read_qual = al.qual
cigar = cigar_decode(al.cigar)
n_tail = tail_lengths[al.qname]
#if reverse:
# if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
# bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
#else:
# if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
# bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
#
#extension = 0
#while extension < n_tail and bases_ref[extension] == 'A':
# extension += 1
if reverse:
feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
else:
feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
bases_ref = feat.get_seq(references).upper()
# Allow up to 60% mismatch on As
# Treat soft clipping as insertion for simplicity
cigar = cigar.replace("S","I")
assert "H" not in cigar, "Can't handle hard clipping"
extension = 0
best_score = 0.0
score = 0.0
# Soft clipping treated as a mismatch
i = len(cigar)-1
while i >= 0 and cigar[i] in "I":
score += non_a_score
i -= 1
for i in xrange(n_tail):
if bases_ref[i] == "A":
score += a_score
else:
score += non_a_score
if score >= best_score:
extension = i+1
best_score = score
#print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref
if n_tail-extension > 0:
al.extra.append('AN:i:%d' % (n_tail-extension))
al.extra.append('AG:i:%d' % (extension))
if adaptor_
|
请发表评论