• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python pybedtools.BedTool类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中pybedtools.BedTool的典型用法代码示例。如果您正苦于以下问题:Python BedTool类的具体用法?Python BedTool怎么用?Python BedTool使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了BedTool类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: generate_bed_file_annotations

def generate_bed_file_annotations(bed_directory, output_directory, loci):
    """
        Generates the annotation file for every bed file in the bed_directory folder
    """
    
    # Loop over the bed files in the bed directory.
    bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
    logging.info("Start to generate BED file annotations")
    logging.info("Writing annotation to: {0}/".format(output_directory))
    for locus in loci:
        zscore = os.path.join(output_directory, locus) 
        bed_lines, rsids = _bed_from_zscore(zscore)
        tmp_bed = open("tmp.bed","w").writelines(bed_lines)
        snps = BedTool("tmp.bed")
        no_snps = _get_line_number(zscore)
        a_matrix= AnnotateLociMatrix(len(bed_file_list), no_snps)
        logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
        for beds in bed_file_list:
            test_annotation = BedTool(beds)
            inter = snps.intersect(test_annotation)
            idxs = []
            for inte in inter:
                idxs.append(rsids.index(inte.name))
            zeroes = np.zeros(len(rsids))
            for idx in idxs:
                zeroes[idx] = 1
            a_matrix.add_annotation(zeroes, beds)
        annotations_file = os.path.join(output_directory, locus + ".annotations")
        logging.info("Writing annotation matrix to: {0}".format(annotations_file))
        a_matrix.write_annotations(annotations_file)
        os.remove("tmp.bed")
开发者ID:theboocock,项目名称:fine_mapping_pipeline,代码行数:31,代码来源:annotation.py


示例2: xstream

def xstream(a, b, distance, updown, out):
    """
    find all things in b that are within
    distance of a in the given direction
    (up or down-stream)
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw':True, direction: distance}

    if "l" in kwargs: kwargs["r"] = 0
    else: kwargs["l"] = 0
    a = BedTool(a).saveas()

    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the window'ed Bed
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen: continue
        out.write(str(row) + "\t.\n")
    out.flush()
    assert len(BedTool(out.name)) == len(a)
开发者ID:GunioRobot,项目名称:bio-playground,代码行数:34,代码来源:superanno.py


示例3: get_coverage

def get_coverage(bed_prefix, directory, file_prefix, bam):
    """
    Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any
    false negative sites
    :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
    :param directory: location of patient results
    :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
    :return out: filename for coverage stats
    """
    #TODO change BAM path so filename is not required
    print 'Generating coverage stats.'
    whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
    out = directory + '/giab_results/whole_bed_coverage.txt'
    command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
              ' ' + bam + ' > ' + out + '.tmp'
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'Sambamba complete.'
    #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
    # them to the file at coverage 0
    temp_bed = out.replace('.txt', '.bed.tmp')
    command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
    print command
    try:
        subprocess.check_call(command, shell=True)
        print 'BED coordinates extracted.'
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)


    coverage_bed = BedTool(temp_bed)
    print 'BED tool created'
    whole_bedtool = BedTool(whole_bed)
    print 'Intersecting'
    missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
    missing_file = directory + '/giab_results/regions_missing'
    missing_regions.moveto(missing_file)
    print 'Generating file'
    sample_split = file_prefix.split('-')
    sample = sample_split[1] + '-' + sample_split[2]
    command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
    print command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)

    command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'fix complete.'
    return out
开发者ID:sch-sdgs,项目名称:SDGSValidation,代码行数:60,代码来源:giab_comparison_freebayes.py


示例4: segmentations

def segmentations(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        call(sort_cmd1, shell=True)
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
        for entry in annots: 
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    names = {
        'CTCF': 'CTCF_REG', 
        'E':    'ENH', 
        'PF':   'TSS_FLANK', 
        'R':    'REP', 
        'T':    'TRAN', 
        'TSS':  'TSS', 
        'WE':   'WEAK_ENH'
    }

    return DataFrame(results, index=names.keys()).T.rename(columns=names)   
开发者ID:tianxiahuihui,项目名称:bioinformatics,代码行数:26,代码来源:anno_info2.py


示例5: filter_bed

def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, deault STDOUT.
    :returns:  0 on success 1 on failure

    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return -1

    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))

    bed      = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)

    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))
开发者ID:rmagoglia,项目名称:ASEr,代码行数:27,代码来源:snps.py


示例6: calculate_ovl

def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s)
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=ab.fn, outfile=scoresfile)
开发者ID:rrane,项目名称:jcvi,代码行数:7,代码来源:reformat.py


示例7: annotate_peaks

def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms to bed, and outputs annotation of where the 
    miRNA seed is interrogating via Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)

    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)

    # create the reference beds
    reference = {}
    for name, bed in izip(names, beds):
        reference[name] = BedTool(bed)

    for name in names:

        # intersect the mirna bed with the reference annotations
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):

            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print " ".join(map(str, fields))
开发者ID:brwnj,项目名称:cu_projects,代码行数:25,代码来源:peak_annotation.py


示例8: _get

def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
开发者ID:vladsaveliev,项目名称:TargQC,代码行数:35,代码来源:__init__.py


示例9: gene_regions

def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse')

        for entry in annots:
            regions = {}
            for region in entry[4].split(','):  
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)
开发者ID:tianxiahuihui,项目名称:bioinformatics,代码行数:27,代码来源:anno_num.py


示例10: calc_origin_bkgd_freqs

def calc_origin_bkgd_freqs(bedtool, strand, fasta_filename, verbose):

    # add strand to bedtool
    if strand == 'pos':
        strand_char = '+'
    elif strand == 'neg':
        strand_char = '-'

    intervals = []
    for row in bedtool:
        # input is BED6, output needs BED6
        row.strand = strand_char
        intervals.append(row)

    stranded_bedtool = BedTool(intervals)

    fastatool = stranded_bedtool.sequence(fi=fasta_filename, s=True)

    kwargs = {'region_size_min':1,
              'region_size_max':1,
              'ignore_chroms':[],
              'only_chroms':[],
              'verbose':verbose}

    if verbose:
        print >>sys.stderr, ">> calculating background freqs ..."

    result = calc_bkgd_counts(fastatool.seqfn, **kwargs)

    return result
开发者ID:hesselberthlab,项目名称:modmap,代码行数:30,代码来源:origin_analysis.py


示例11: getNegativeDatasetFASTA

def getNegativeDatasetFASTA(config):
	try:
		coordinates = BedTool(config['negativesBedFile'])
		genome = BedTool(config['maize_genome_filepath'])
		dataset = coordinates.sequence(fi=genome, fo=config['negative_dataset_output'])
	except ValueError:
		print 'getNegativeDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
开发者ID:Tay2510,项目名称:PyCorn,代码行数:7,代码来源:driver.py


示例12: sequence_from_bedfile

def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
    """Fasta sequences from set of genomic features in a bed file
        Args:
            fastafile: fasta file with genomic sequence
            features: dataframe of features/coords with bed file col names
            bedfile: optionally provide a bed file instead
            pad5,pad3: flanking sequence at 5' or 3' ends
        Returns:
            a pandas dataframe with name, sequence and coord columns"""

    from pybedtools import BedTool
    if bedfile != None:
        features = utils.bed_to_dataframe(bedfile)
    new = []
    for n,r in features.iterrows():
        if r.strand == '+':
            coords = (r.chr,r.chromStart-pad5,r.chromEnd+pad3)
            seq = str(BedTool.seq(coords, fastafile))
        else: #reverse strand
            coords = (r.chr,r.chromStart-pad3,r.chromEnd+pad5)
            seq = str(BedTool.seq(coords, fastafile))
            seq = HTSeq.Sequence(seq).get_reverse_complement()
        #print n, coords, r['name']
        new.append([r['name'],str(seq),coords])
    new = pd.DataFrame(new, columns=['name','seq','coords'])
    return new
开发者ID:dmnfarrell,项目名称:mirnaseq,代码行数:26,代码来源:utils.py


示例13: main

def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('bed', help='bed with miRNA as name')
    p.add_argument('--reference-beds', dest='reference', nargs='+', 
        help='reference beds for each feature to annotate')
    p.add_argument('--names', nargs='+', 
        help='names corresponding to reference files')
    args = p.parse_args()
    if not args.names and not args.reference:
        sys.exit(p.print_help())
    
    bed = BedTool(args.bed)
    
    # create the reference beds
    reference = {}
    for refname, refbed in izip(args.names, args.reference):
        reference[refname] = BedTool(refbed)
    
    for refname in args.names:
    
        # intersect the mirna bed with the reference annotations
        for b in bed.intersect(reference[refname], s=True, stream=True):
            # Cytoscape formatting
            fields = (b.name, "=", refname)
            print " ".join(map(str, fields))
开发者ID:brwnj,项目名称:cu_projects,代码行数:26,代码来源:node_annotation.py


示例14: gene_regions

def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        #sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        #call(sort_cmd1, shell=True)
        tempfile1 = tempfile.mktemp()
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)   
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')

        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)
开发者ID:tianxiahuihui,项目名称:bioinformatics,代码行数:29,代码来源:anno_info2.py


示例15: main

def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    
    if args.verbose: sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    
    if args.verbose: sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    
    if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)
    
    if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs
            gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
                        
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            
            # fraction
            print (corrected_distance / total_exon_length)
        
        except ValueError:
            continue
开发者ID:brwnj,项目名称:cu_projects,代码行数:59,代码来源:relative_cluster_positions.py


示例16: gc_content

def gc_content(vf, fa, flank=50):
    print "inside gc_content"
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    nc = flanks.nucleotide_content(fi=fa)
    results = dict([ (r.name, float(r[5])) for r in nc ])
    print "exiting gc_content"
    return Series(results, name="GC") 
开发者ID:amanchahar,项目名称:ComplexVariants,代码行数:8,代码来源:gwava_annotate.py


示例17: getPositiveDatasetFASTA

def getPositiveDatasetFASTA(config):
	if (not os.path.isfile(config['positive_dataset_output'])):
		try:
			coordinates = BedTool(config['bed_file_post'])
			genome = BedTool(config['maize_genome_filepath'])
			dataset = coordinates.sequence(fi=genome, fo=config['positive_dataset_output'])
		except ValueError:
			print 'getPositiveDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
开发者ID:Tay2510,项目名称:PyCorn,代码行数:8,代码来源:driver.py


示例18: feat_dist

def feat_dist(vf, af, name):
    print "inside feat_dist"
    v = BedTool(vf)
    a = BedTool(af)
    closest = v.closest(a, D="b")
    results = dict([ (r.name, int(r[len(r.fields)-1])) for r in closest ])
    print "exiting feat_dist"
    return Series(results, name=name)
开发者ID:amanchahar,项目名称:ComplexVariants,代码行数:8,代码来源:gwava_annotate.py


示例19: motifs

def motifs(vf, af):
    print "inside motif"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit motif"
    return Series(results, name="pwm")
开发者ID:amanchahar,项目名称:ComplexVariants,代码行数:8,代码来源:gwava_annotate.py


示例20: cpg_islands

def cpg_islands(vf, af):
    print "inside cpg_islands"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit cpg_islands"
    return Series(results, name="cpg_island")
开发者ID:amanchahar,项目名称:ComplexVariants,代码行数:8,代码来源:gwava_annotate.py



注:本文中的pybedtools.BedTool类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python pybedtools.IntervalFile类代码示例发布时间:2022-05-25
下一篇:
Python pybedtools.set_tempdir函数代码示例发布时间:2022-05-25
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap