This article collects typical usage examples of the pysam.faidx function in Python. If you are wondering what faidx does, how to call it, or want to see it used in real projects, the curated code examples below should help.
Twenty code examples of the faidx function are shown below, sorted by popularity by default.
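Before the project-specific examples, here is a minimal sketch of the pattern most of them share: build the .fai index with pysam.faidx if it is missing, then open the FASTA with pysam.FastaFile for fast region lookups. The file name and coordinates below are placeholders, not taken from any of the projects that follow.

import os
import pysam

def fetch_region(fasta_path, chrom, start, end):
    """Index fasta_path if needed, then return the sequence of chrom:start-end (0-based, end-exclusive)."""
    if not os.path.exists(fasta_path + '.fai'):
        pysam.faidx(fasta_path)           # writes fasta_path + '.fai' next to the FASTA
    fa = pysam.FastaFile(fasta_path)      # uses the .fai for random access
    seq = fa.fetch(chrom, start, end)
    fa.close()
    return seq

# seq = fetch_region('reference.fa', 'chr1', 999, 1010)   # placeholder call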
Example 1: gatk_realigner
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)
    runner.run_fn("picard_index_ref", ref_file)
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        runner.run_fn("picard_index", align_bam)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam,
                                                     ref_file, dbsnp, region,
                                                     out_file, deep_coverage,
                                                     variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # No longer required in recent GATK (> Feb 2011) -- now done on the fly
        # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
        return realign_bam
    elif out_file:
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam
Author: brentp, Project: bcbio-nextgen, Lines: 29, Source: realign.py
Example 2: __init__
def __init__(self, filename):
    if not os.path.exists(filename + '.fai'):
        import pysam
        pysam.faidx(filename)

    self.fasta = open(filename)
    self.index = self.load_index(filename + '.fai')
Author: hyeshik, Project: tailseeker, Lines: 7, Source: sequtils.py
Example 3: extracAllels
def extracAllels(chrom, vcf_coord, var_descr, genref):
    '''compute alternative allel from description in bed file.
    must be one of sub(C->T), ins(CCCT), del(5)
    '''
    ref_allel = pysam.faidx(genref, chrom+':'+vcf_coord+'-'+vcf_coord)[1].strip()
    if 'sub' in var_descr:
        yy = var_descr.split('->')
        ref = yy[0][-1]
        if ref == ref_allel:
            return '\t'.join([ref, yy[1][0]])
        else:
            print 'ref allels do not match, exiting...'
            print chrom, vcf_coord, var_descr
            sys.exit(1)
    elif 'ins' in var_descr:
        yy = var_descr.split('(')[1]
        return '\t'.join([ref_allel, ref_allel + yy[:-1]])
    elif 'del' in var_descr:
        yy = var_descr.split('(')[1]
        del_len = int(yy[:-1])
        vcf_coord_end = str(int(vcf_coord) + int(del_len))
        ref = pysam.faidx(genref, chrom+':'+vcf_coord+'-'+vcf_coord_end)[1].strip()
        if ref[0] == ref_allel:
            return '\t'.join([ref, ref_allel])
        else:
            print 'ref allels do not match in del, exiting...'
            print chrom, vcf_coord, var_descr
            sys.exit(1)
    else:
        print 'format not found, exiting...'
        sys.exit(1)
Author: asalomatov, Project: misc_scripts, Lines: 31, Source: ios2table.py
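Example 3 parses the raw text that pysam.faidx prints (a FASTA header line followed by sequence lines), which relies on older pysam releases returning that output as a list of lines; current releases return a single string instead. A version-independent sketch of the same lookup, using pysam.FastaFile.fetch with illustrative names (chrom and vcf_coord are the 1-based VCF-style inputs from the example above):

import pysam

def ref_base(genref, chrom, vcf_coord):
    """Return the reference base at a 1-based coordinate."""
    fa = pysam.FastaFile(genref)           # expects (or builds) genref + '.fai'
    pos = int(vcf_coord)
    base = fa.fetch(chrom, pos - 1, pos)   # fetch() takes 0-based, end-exclusive coordinates
    fa.close()
    return base.upper()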
Example 4: bed_tofasta
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    if not os.path.exists('%s.fai' % ref_fasta):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)

    refs = set()
    with open('%s.fai' % ref_fasta) as f:
        for line in f:
            refs.add(line.split('\t')[0].strip())

    name = ''
    for region in bed:
        if include_name:
            name = '%s|' % (region.name.strip())

        if region.end - region.start >= min_size and region.chrom in refs:
            seq = fasta.fetch(region.chrom, region.start, region.end)
            if stranded and region.strand:
                if region.strand == '-':
                    seq = revcomp(seq)
                out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
            else:
                out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))

    fasta.close()
Author: dickgroenenberg, Project: ngsutils, Lines: 26, Source: tofasta.py
Example 5: resolved_tool_contract_runner
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    vcf_path = rc.task.output_files[1]
    dataset_path = rc.task.output_files[2]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", str(Constants.DEFAULT_MASK_RADIUS) if \
            bool(rc.task.options[Constants.MASKING_ID]) else "0",
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
Author: PacificBiosciences, Project: GenomicConsensus, Lines: 32, Source: main.py
Example 6: generate_data_files
def generate_data_files(dir_name=None):
    import logging
    logging.basicConfig(level=logging.INFO)
    if dir_name is not None:
        os.chdir(dir_name)
    with open("tst1.fasta", "w") as f:
        f.write(">ecoliK12_pbi_March2013_2955000_to_2980000\n")
        f.write("AAAGAGAGAG" * 2500)
    pysam.faidx("tst1.fasta")
    for i in range(len(sam_strings)):
        sam_file = "tst_%d_subreads.sam" % (i + 1)
        bam_file = "tst_%d_subreads.bam" % (i + 1)
        with open(sam_file, "w") as sam_out:
            sam_out.write(sam_strings[i])
        logging.info("Converting {s} to BAM".format(s=sam_file))
        # FIXME pysam is way broken - can't handle unmapped input?
        # convert to bam using pysam
        # with pysam.AlignmentFile(sam_file, "r", check_sq=False) as sam_in:
        #     with pysam.AlignmentFile(bam_file, "wb",
        #                              template=sam_in) as bam_out:
        #         for s in sam_in:
        #             bam_out.write(s)
        args = ["samtools", "view", "-b", "-o", bam_file, sam_file]
        assert subprocess.call(args) == 0, args
        os.remove(sam_file)
        # XXX don't create .pbi for this file, we want it to be absent
        if bam_file != "tst_2_subreads.bam":
            logging.info("Indexing {b}".format(b=bam_file))
            subprocess.call(["pbindex", bam_file])
Author: mpkocher, Project: pbcoretools, Lines: 29, Source: test_pbvalidate_bam.py
Example 7: run
def run(self):
    AbstractAnalysis.run(self)  # Call base method to do some logging
    localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
    localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
    samToBamFile(self.samFile, localBamFile)
    pysam.sort(localBamFile, localSortedBamFile)
    pysam.index(localSortedBamFile + ".bam")
    pysam.faidx(self.referenceFastaFile)

    file_header = self.readFastqFile.split(".fastq")[0].split("/")[-1] + "_" + self.referenceFastaFile.split(".fa")[0].split("/")[-1]
    consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
    consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")

    system("samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s" \
           % (self.referenceFastaFile, localSortedBamFile + ".bam", consensus_vcf))
    system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
    system("rm -rf %s" % (self.referenceFastaFile + ".fai"))

    formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")
    formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
    system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))
    self.finish()
Author: mitenjain, Project: nanopore, Lines: 25, Source: consensus.py
Example 8: merge_mut2
def merge_mut2(mutation_file_list, output_file, reference):
    mut2sample = {}
    sample_ind = 0

    for mut_file in mutation_file_list:
        sample_ind = sample_ind + 1
        is_vcf = True if mut_file.endswith(".vcf") or mut_file.endswith(".vcf.gz") else False
        hin2 = gzip.open(mut_file, 'r') if mut_file.endswith(".gz") else open(mut_file, 'r')
        for line2 in hin2:
            F2 = line2.rstrip('\n').split('\t')
            if F2[0].startswith('#'): continue
            if F2[0] == "Chr": continue

            if is_vcf == False:
                pos, ref, alt = F2[1], F2[3], F2[4]
                # insertion
                if F2[3] == "-":
                    # get the sequence for the reference base
                    seq = ""
                    for item in pysam.faidx(reference, F2[0] + ":" + str(F2[1]) + "-" + str(F2[1])):
                        seq = seq + item.rstrip('\n')
                    seq = seq.replace('>', '')
                    seq = seq.replace(F2[0] + ":" + str(F2[1]) + "-" + str(F2[1]), '')
                    ref, alt = seq, seq + F2[4]
                # deletion
                if F2[4] == "-":
                    # get the sequence for the reference base
                    seq = ""
                    for item in pysam.faidx(reference, F2[0] + ":" + str(int(F2[1]) - 1) + "-" + str(int(F2[1]) - 1)):
                        seq = seq + item.rstrip('\n')
                    seq = seq.replace('>', '')
                    seq = seq.replace(F2[0] + ":" + str(int(F2[1]) - 1) + "-" + str(int(F2[1]) - 1), '')
                    pos, ref, alt = str(int(F2[1]) - 1), seq + F2[3], seq

                QUAL = 60
                INFO = "SOMATIC"
                key = '\t'.join([F2[0], pos, '.', ref, alt, str(QUAL), "PASS", INFO])
            else:
                key = '\t'.join(F2[0:8])

            if key not in mut2sample:
                mut2sample[key] = []
            mut2sample[key].append(str(sample_ind))

    sample_num = sample_ind

    hout = open(output_file, 'w')
    for mut in sorted(mut2sample):
        if len(mut2sample[mut]) == sample_num: continue
        print >> hout, mut + '\t' + ','.join(mut2sample[mut])
    hout.close()
Author: Genomon-Project, Project: GenomonSplicingMutation, Lines: 60, Source: preprocess.py
Example 9: resolved_tool_contract_runner
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    dataset_path = rc.task.output_files[1]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if rc.task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
Author: lpp1985, Project: lpp_Script, Lines: 30, Source: main.py
Example 10: run
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    # '--log-file foo.log',
    # '--verbose',
    # '--debug',  # requires 'ipdb'
    # '-j NWORKERS',
    # '--algorithm quiver',
    # '--diploid',  # binary
    # '--minConfidence 40',
    # '--minCoverage 5',
    # '--alignmentSetRefWindows',
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    system(cmd.format(**locals()))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))
        # Convert to contigset.xml
        import pysam
        pysam.faidx(fasta)  # pylint: disable=no-member
        # I do not know why pylint does not see this defined.
        ds = ContigSet(fasta, strict=True)
        ds.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
Author: PacificBiosciences, Project: FALCON-polish, Lines: 26, Source: run_variantCaller.py
Example 11: fetch_file
def fetch_file(options):
    if len(options) != 4:
        sys.exit('fetch_ucsc.py hg19/hg38/mm10 ref/kg/ens/fa out')
    if options[1] in {'hg19', 'hg38', 'mm10'}:
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/' % options[1]
    else:
        sys.exit('Only support human or mouse!')
    s = string.maketrans(' ', '_')
    if options[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(options[3], 'w') as outf:
            outf.write(gzip.open('refFlat.txt.gz', 'rb').read())
    elif options[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz',
                           'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                iso = line.split('\t')[0]
                gene = line.split('\t')[4].translate(s)
                kg_iso[iso] = gene
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(options[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif options[2] == 'ens':  # Ensembl gene annotations
        if options[1] == 'hg38':
            sys.exit('No Ensembl gene annotations for hg38!')
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz',
                           'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(options[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif options[2] == 'fa':  # Genome sequences
        if options[1] == 'hg38':
            fa_path = 'bigZips/hg38.chromFa.tar.gz'
        else:
            fa_path = 'bigZips/chromFa.tar.gz'
        urllib.urlretrieve(path + fa_path, 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as fa:
            with open(options[3], 'w') as outf:
                for f in fa:
                    if f.isfile():
                        outf.write(fa.extractfile(f).read())
        pysam.faidx(options[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
Author: BioXiao, Project: CIRCexplorer2, Lines: 59, Source: fetch_ucsc.py
Example 12: write_fa_subset
def write_fa_subset(seq_names, infile, outfile):
    if not os.path.exists(infile + '.fai'):
        pysam.faidx(infile)

    f = pyfastaq.utils.open_file_write(outfile)
    for name in seq_names:
        print(pysam.faidx(infile, name), end='', file=f)
    pyfastaq.utils.close(f)
Author: andrewjpage, Project: ariba, Lines: 8, Source: faidx.py
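Example 12 also depends on how pysam.faidx returns the extracted record (one string in current pysam, an iterable of lines in older releases). A sketch of an alternative that sidesteps that difference by fetching each named sequence through pysam.FastaFile; the function name below is illustrative, not part of the ariba source:

import pysam

def write_fa_subset_fetch(seq_names, infile, outfile):
    """Write the selected sequences from an indexed FASTA to a new FASTA file."""
    fa = pysam.FastaFile(infile)          # expects (or builds) infile + '.fai'
    with open(outfile, 'w') as out:
        for name in seq_names:
            out.write('>%s\n%s\n' % (name, fa.fetch(name)))   # fetch(name) returns the whole sequence
    fa.close()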
Example 13: check_fasta
def check_fasta(fa_f, pysam_flag=True):
    if not os.path.isfile(fa_f + '.fai'):
        pysam.faidx(fa_f)
    if pysam_flag:  # return pysam FastaFile object
        fa = pysam.FastaFile(fa_f)
        return fa
    else:  # return fasta file path
        return fa_f
Author: YangLab, Project: CIRCexplorer2, Lines: 8, Source: parser.py
Example 14: _generate_chunk_output_file
def _generate_chunk_output_file(self, i=None):
    fn = tempfile.NamedTemporaryFile(suffix=".fasta").name
    suffix = "|arrow"
    with open(fn, "w") as f:
        header, seq = self.CHUNK_CONTIGS[i]
        f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
    pysam.faidx(fn)
    return self._make_dataset_file(fn)
Author: mpkocher, Project: pbcoretools, Lines: 8, Source: test_tasks_scatter_gather.py
Example 15: __init__
def __init__(self, num, refname):
    self.num = int(num)
    self.refname = refname
    if not os.path.exists('%s.fai' % refname):
        pysam.faidx(refname)
    self.ref = pysam.Fastafile(refname)
Author: pjbriggs, Project: tools-iuc, Lines: 8, Source: filter.py
Example 16: ensure_fasta_index
def ensure_fasta_index(fasta_fname):
    """Ensure a FASTA file is indexed for samtools, to enable fast lookup."""
    fai_fname = fasta_fname + '.fai'
    if not is_newer_than(fai_fname, fasta_fname):
        echo("Indexing FASTA file", fasta_fname)
        pysam.faidx(fasta_fname)
    assert os.path.isfile(fai_fname), "Failed to generate index " + fai_fname
    return fai_fname
Author: roryk, Project: cnvkit, Lines: 8, Source: faidx.py
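A brief usage sketch for Example 16 (the helpers is_newer_than and echo belong to the source project; genome.fa is a placeholder file name):

import pysam

fai = ensure_fasta_index('genome.fa')    # returns 'genome.fa.fai', creating it if stale or missing
genome = pysam.FastaFile('genome.fa')    # fast region lookups are now possible
print(genome.references[:5], genome.lengths[:5])
genome.close()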
Example 17: index_fasta
def index_fasta(infile):
    """index fasta file using samTools"""
    if os.path.isfile(infile + '.fai'):
        # index already exists
        pass
    else:
        print >>sys.stderr, "Indexing " + infile + " ...",
        pysam.faidx(infile)
        print >>sys.stderr, "Done!"
Author: pombredanne, Project: presentations-9, Lines: 8, Source: cpat.py
Example 18: checkFASTA
def checkFASTA( fastaFileStr ):
    fastaIndex = fastaFileStr + '.fai'
    if os.path.isfile(fastaFileStr) == False:
        print('ERROR: FASTA file does not exist')
        exit()
    elif os.path.isfile(fastaIndex) == False:
        print('WARNING: FASTA index file does not exist...creating')
        pysam.faidx( fastaFileStr )
    return True
Author: bhofmei, Project: analysis-scripts, Lines: 9, Source: region_read_allele.py
Example 19: run_spades_parallel
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))

    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
Author: thongnt2, Project: metasv, Lines: 57, Source: run_spades.py
Example 20: get_genome_stats
def get_genome_stats(genome_fasta):
    reference_fasta_index = genome_fasta + '.fai'
    if not os.path.exists(reference_fasta_index):
        print("\nIndexing %s\n" % os.path.abspath(genome_fasta))
        pysam.faidx(genome_fasta)

    reference_genome = pysam.FastaFile(genome_fasta)
    total_bases = sum(reference_genome.lengths)

    return reference_genome.nreferences, total_bases
Author: BennerLab, Project: atg, Lines: 10, Source: index.py
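For instance, with a placeholder genome file (a usage sketch, not part of the atg source):

n_contigs, n_bases = get_genome_stats('genome.fa')
print('%d contigs, %d bp total' % (n_contigs, n_bases))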
Note: The pysam.faidx examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow the corresponding project's License. Do not reproduce without permission.