• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python pybloomfilter.BloomFilter类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中pybloomfilter.BloomFilter的典型用法代码示例。如果您正苦于以下问题：Python BloomFilter类的具体用法？Python BloomFilter怎么用？Python BloomFilter使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了BloomFilter类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: __init__

 def __init__(self, path=FILTER_PATH, debug=False):
     """Load the URL bloom filter stored at FILTER_PATH, creating it if missing.

     ``path`` and ``debug`` are only forwarded to the parent initialiser;
     the bloom filter file itself is always located at FILTER_PATH.
     """
     # NOTE(review): ``path`` is not used for the bloom filter file; this
     # looks intentional because the parent class receives it - confirm.
     if not os.path.exists(FILTER_PATH):
         print("created a new bloom filter. ")
         self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH)
     else:
         self.url_filter = BloomFilter.open(FILTER_PATH)
     super(DuplicateFilter, self).__init__(path, debug)
开发者ID:JASON0916,项目名称:DianpingSpider,代码行数:7,代码来源:duplicate_filter.py


示例2: create_ref_bloom_filter

def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """Build a bloom filter file from windows of a reference sequence.

    The first record of ``reference_file`` is cut into consecutive windows
    and every window is stored as one entry of a new bloom filter backed by
    ``bf_file``.

    Args:
        reference_file: Path of the FASTA/FASTQ file to read.
        error_rate: Error rate passed to ``BloomFilter``.
        bf_file: Path of the bloom filter file to create.
        format: Either ``"fasta"`` or ``"fastq"``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
        StopIteration: If ``reference_file`` contains no record.
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)
    else:
        # Previously an unknown format fell through and failed later with
        # a NameError on ``file_it``.
        raise ValueError("unsupported format: %r" % (format,))

    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        read_it = record(file_it(handle))
        read_len = 109
        step = read_len

        bf = BloomFilter(capacity, error_rate, bf_file)
        try:
            # Only the first record of the file is processed.
            sequence = next(read_it)

            i = 0
            while i < len(sequence):
                # NOTE(review): the window is read_len - 1 characters long
                # while the step is read_len, so one character between
                # consecutive windows is skipped - confirm this is intended.
                read = sequence[i:i + read_len - 1]
                i += step
                print(read)
                # ``update`` would add every single character of the window;
                # ``add`` stores the window itself as one entry.
                bf.add(str(read))
        finally:
            # Close the filter even when reading or adding fails.
            bf.close()
开发者ID:vals,项目名称:Boutonniere,代码行数:34,代码来源:boutonniere.py


示例3: LinkFilter

class LinkFilter():
    """Filters out already-seen links using two file-backed bloom filters.

    One filter tracks index links, the other tracks html links; both files
    are named after the domain given to the constructor.
    """

    def __init__(self, domain):
        """Open (or create) the ``<domain>_index.bf`` and ``<domain>_html.bf`` filters."""
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        self.bf_index = self._load_filter(self.file_index)
        self.bf_html = self._load_filter(self.file_html)

    @staticmethod
    def _load_filter(filename):
        """Return the bloom filter stored in ``filename``, creating it if absent."""
        if not os.path.exists(filename):
            return BloomFilter(100000000, 0.001, filename)
        return BloomFilter.open(filename)

    def index_filter(self, links):
        """Return the links whose ``url`` was not yet in the index filter.

        Every url is added to the filter as a side effect.
        """
        return [link for link in links if not self.bf_index.add(link.url)]

    def html_filter(self, links):
        """Return the links whose ``url`` was not yet in the html filter.

        Every url is added to the filter as a side effect.
        """
        return [link for link in links if not self.bf_html.add(link.url)]
开发者ID:wangjie1991,项目名称:crawler,代码行数:30,代码来源:linkfilter.py


示例4: main

def main():
    """Convert a trace file into ``<op>,<number>`` records.

    Reads the trace file named by the single command line argument and, for
    every line starting with ``W,`` or ``R,``, appends one record to
    ``converted-<trace file>``. The number is derived from the SHA-1 hash of
    the rest of the line when both bloom filters already contained their
    hash, otherwise from the MD5 hash. All other lines are ignored.
    """
    # Check for command line arguments
    if len(sys.argv) != 2:
        print('Usage: %s [trace file]' % os.path.basename(sys.argv[0]))
        sys.exit(1)

    # Read arguments from command line
    in_file = sys.argv[1]

    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')

    output_file_name = "converted-" + in_file

    # Both files are closed even if processing fails part-way through.
    with open(in_file, 'r') as trace, open(output_file_name, "a") as out:
        for line in trace:
            if line[0:2] not in ("W,", "R,"):
                continue
            payload = line[2:]
            hash1 = int(hashlib.sha1(payload).hexdigest(), 16) % (10 ** 10)
            hash2 = int(hashlib.md5(payload).hexdigest(), 16) % (10 ** 10)
            # ``and`` short-circuits: bf2 is only updated when bf1 already
            # contained hash1.
            if bf1.add(hash1) and bf2.add(hash2):
                out.write('%s,%d\n' % (line[0], hash1 * 10000))
            else:
                out.write('%s,%d\n' % (line[0], hash2 * 10000))
开发者ID:theopengroup,项目名称:EAD,代码行数:31,代码来源:convert.py


示例5: __init__

	def __init__(self, seeds, done_que, run_que):
		"""Initialise crawler state, the download pool and the done-sites filter.

		Args:
			seeds: Iterable of seed URLs. When ``run_que`` is empty, the part
				of each seed after ``"http://"`` is queued; a seed without
				that prefix raises IndexError.
			done_que: Stored as ``self.done_que``.
			run_que: Queue of hosts to crawl; must provide ``qsize()`` and
				``put()``.
		"""
		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0

		self.run_que = run_que
		self.done_que = done_que
		self.tasks = []
		self.done = 1

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		self.blacklist = set((
			'.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil',
			'mail', '.google', 'weibo.com', 't.cn', 'wikipedia', 'facebook',
			'twitter', 'dropbox',
		))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 60
		self.poolmaxfree = 20
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0

		self.curspeed = 0

		self.debugnosave = 1
		self.tt = 1

		self.done_sites_fname='done_sites.bin'
		try:
			self.bfdone = BloomFilter.open(self.done_sites_fname)
		except Exception:
			# Fall back to a fresh filter when the file cannot be opened.
			# ``Exception`` rather than a bare except, so KeyboardInterrupt
			# and SystemExit are no longer swallowed here.
			self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M

		# Seed the run queue only when it is empty.
		if self.run_que.qsize() == 0:
			for seed in seeds:
				self.run_que.put( seed.split("http://")[1] )

		# Pattern extracting the host part of absolute links in href attributes.
		if self.https_enable == 0:
			self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
		else:
			self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)
开发者ID:salmonx,项目名称:crawler,代码行数:55,代码来源:gevent_redis_multiprocess.py


示例6: __init__

    def __init__(self):
        """Create the MySQL helper, bind helper modules and load the bloom filter."""
        self.mysql = mysql.Mysql()
        # Modules are exposed as instance attributes.
        self.re, self.time = re, time
        self.datetime, self.requests = datetime, requests

        # De-duplicate with a bloom filter that is loaded from
        # "new_filter.bloom" when that file exists, otherwise created there.
        bloom_path = "new_filter.bloom"
        if not os.path.isfile(bloom_path):
            self.bf = BloomFilter(10000000, 0.01, bloom_path)
        else:
            self.bf = BloomFilter.open(bloom_path)
开发者ID:mylinlan,项目名称:spider,代码行数:12,代码来源:gzrb.py


示例7: __init__

  def __init__(self, node_n, seen_persist, Q_logs=None):
    """Create the queues, caches and seen-url bloom filter of this node.

    Args:
      node_n: Stored as ``self.node_n``.
      seen_persist: When true, try to reopen the bloom filter stored at
        BF_FILENAME; a new filter is created if that fails. When false a
        new filter is always created.
      Q_logs: Optional queue receiving log messages; may be None.
    """
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0

    # single variable for tracking whether node should be active or not
    self.active = True
    
    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()

    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}
    
    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
      try:
        self.seen = BloomFilter.open(BF_FILENAME)
      except Exception:
        # Q_logs defaults to None, so only log when a queue was supplied;
        # otherwise the fallback below would never be reached.
        if self.Q_logs is not None:
          self.Q_logs.put('Error opening bloom filter, creating new one')
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
      self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}

    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()

    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()

    # active url count queue- for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()

    # thread active url dict- a dict of active urls by thread using, for restart dump
    # { thread_name: active_url }
    # NOTE: note that there are problems with this methodology, but that errors will only lead
    # to data redundancy (as opposed to omission)...
    self.thread_active = {}
    
    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
开发者ID:abresler,项目名称:RL-crawler,代码行数:53,代码来源:urlFrontier.py


示例8: dedup

def dedup(fname):
    """Copy ``fname`` to ``deduped.tsv``, dropping lines with a repeated description.

    The description is the sixth tab-separated field of a line. A line is
    written only when the MD5 digest of its description was not already
    reported as present by the bloom filter.
    """
    seen = BloomFilter(1E8, 0.01)

    with open(fname, 'r') as src, open('deduped.tsv', 'w') as dst:
        for row in src:
            fields = row.split('\t')
            digest = md5.new(fields[5]).digest()
            # ``add`` returns a true value when the digest was already present.
            if not seen.add(digest):
                dst.write(row)
开发者ID:jisaacso,项目名称:team-thorn,代码行数:12,代码来源:deduper.py


示例9: create_bf

def create_bf():
    """Fill the ``filter_base.bloom`` bloom filter with key digests.

    Reads up to ``count`` digests of ``keyDigestLen`` characters each from
    ``keyDigestFile`` and adds every digest to a new bloom filter created
    with capacity ``count`` and error rate ``error_rate``.
    """
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')

    # The context manager closes the file even when reading or adding fails.
    with open(keyDigestFile, 'r') as digest_file:
        for _ in range(count):
            key_digest = digest_file.read(keyDigestLen)
            if not key_digest:
                # End of file reached before ``count`` digests were read;
                # do not add empty strings to the filter.
                break
            bf.add(key_digest)
开发者ID:enzocxt,项目名称:bloomfilter,代码行数:13,代码来源:bloomfilter.py


示例10: __init__

    def __init__(self, domain):
        """Open (or create) the index and html bloom filters of ``domain``.

        The filter files are named ``<domain>_index.bf`` and
        ``<domain>_html.bf``.
        """
        index_path = '%s_%s' % (domain, 'index.bf')
        html_path = '%s_%s' % (domain, 'html.bf')
        self.file_index = index_path
        self.file_html = html_path

        # Reuse an existing filter file, otherwise create a new one.
        if not os.path.exists(index_path):
            self.bf_index = BloomFilter(100000000, 0.001, index_path)
        else:
            self.bf_index = BloomFilter.open(index_path)

        if not os.path.exists(html_path):
            self.bf_html = BloomFilter(100000000, 0.001, html_path)
        else:
            self.bf_html = BloomFilter.open(html_path)
开发者ID:wangjie1991,项目名称:crawler,代码行数:13,代码来源:linkfilter.py


示例11: __init__

    def __init__(self, start_url, basic_url):
        """Store the crawl URLs, bind helper modules and load the bloom filter.

        Args:
            start_url: Stored as ``self.start_url``.
            basic_url: Stored as ``self.basic_url``.
        """
        self.basic_url, self.start_url = basic_url, start_url
        self.mysql = mysql.Mysql()
        # Modules are exposed as instance attributes.
        self.re, self.time = re, time
        self.datetime, self.requests = datetime, requests

        # De-duplicate with a bloom filter that is loaded from
        # 'filter.bloom' when that file exists, otherwise created there.
        bloom_path = 'filter.bloom'
        if not os.path.isfile(bloom_path):
            self.bf = BloomFilter(10000000, 0.01, bloom_path)
        else:
            self.bf = BloomFilter.open(bloom_path)
开发者ID:mylinlan,项目名称:spider,代码行数:14,代码来源:myspider.py


示例12: __init__

 def __init__(self):
     """Open the configured bloom filter file, creating it if it is missing.

     The file path, capacity and error rate come from the ``bin_path``,
     ``capacity`` and ``wrong_rate`` entries of the bloom filter config.
     """
     bloom_conf = config.get_boolmfilter_config()
     bin_path = bloom_conf['bin_path']
     if not os.path.exists(bin_path):
         self.bloomfilter = BloomFilter(
             bloom_conf['capacity'], bloom_conf['wrong_rate'], bin_path)
         return
     self.bloomfilter = BloomFilter.open(bin_path)
开发者ID:intohole,项目名称:mortred,代码行数:7,代码来源:utils.py


示例13: __init__

 def __init__(self, settings, debug = False):
     """Create the bloom filter described by the crawler settings.

     Args:
         settings: Object providing ``getint`` and ``get``; the
             ``DUPEFILTER_CAPACITY`` and ``DUPEFILTER_FILENAME`` entries
             are read from it.
         debug: Stored as ``self.debug``.
     """
     capacity = settings.getint("DUPEFILTER_CAPACITY")
     filename = settings.get("DUPEFILTER_FILENAME")
     error_rate = 0.01

     self.capacity = capacity
     self.filename = filename
     self.debug = debug
     self.error_rate = error_rate
     self.logger = logging.getLogger(__name__)
     self.bloom_filter_ = BloomFilter(capacity, error_rate, filename)
开发者ID:wuwenjunwwj,项目名称:inst_spider,代码行数:7,代码来源:bloom_filter.py


示例14: __init__

    def __init__(self):
        """Set up the bloom filter, output file, search index and POPULAR table.

        The ``POPULAR`` table of the ``storecount`` database is dropped and
        recreated on every instantiation. A failure while creating the
        table is printed and rolled back instead of being raised.
        """
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        self.db = MySQLdb.connect("localhost","root","","storecount")
        self.cursor = self.db.cursor()
        self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
        sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);"""

        try:
            self.cursor.execute(sql1)
            self.db.commit()
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit propagate instead of being printed and ignored.
            traceback.print_exc()
            self.db.rollback()
        self.mark = 0
开发者ID:wybini,项目名称:search-engine,代码行数:27,代码来源:pipelines.py


示例15: DuplicatesPipeline

class DuplicatesPipeline(object):
    """Item pipeline that drops items whose url was already seen.

    New items are recorded in the ``visitedsites`` file and handed to the
    search index.
    """

    def __init__(self):
        """Create the bloom filter, open the output file and start the index."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Return ``item`` if its url is new, otherwise raise DropItem."""
        print('************%d pages visited!*****************' % len(self.bf))
        url = item['url']
        # ``add`` returns a true value when the url was already in the filter.
        if self.bf.add(url):
            raise DropItem("Duplicate item found: %s" % item)
        self.save_to_file(url, item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self,url,utitle):
        """Append ``url<TAB>title`` as one line; the title is UTF-8 encoded."""
        out = self.f_write
        out.write(url)
        out.write('\t')
        out.write(utitle.encode('utf-8'))
        out.write('\n')

    def __del__(self):
        """Close the output file and finish the search index."""
        self.f_write.close()
        self.si.IndexDone()
开发者ID:PeinYu,项目名称:SearchEngine,代码行数:28,代码来源:pipelines.py


示例16: initdb

class URLBloomFilter:
    """Stores previously unseen urls in a database, using a bloom filter to detect repeats."""

    # All collaborators start unset; they are filled in by the init* methods.
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self, host = 'localhost', user = 'muye', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        """Connect to the database and keep a cursor in ``self.cur``."""
        # NOTE(review): ``port`` and ``charset`` are accepted but not
        # forwarded to ``connect`` - confirm whether that is intended.
        connection = MySQLConnection.MySQLConn()
        self.dbconn = connection
        connection.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = connection.cursor()

    def initfilter(self, filename = './url.filter'):
        """Open the bloom filter file ``filename``, creating it if it is missing."""
        if not os.path.isfile(filename):
            self.urlbf = BloomFilter(10000000, 0.001, filename)
        else:
            self.urlbf = BloomFilter.open(filename)

    def initsql(self, m_sql):
        """Set the SQL statement executed for every new url."""
        self.sql = m_sql

    def add(self, url):
        """Record ``url``; return True if it was new, False if already seen.

        For a new url the configured SQL statement is executed with the url
        as its argument.
        """
        already_seen = self.urlbf.add(url)
        if already_seen:
            return False
        self.cur.execute(self.sql, url)
        return True

    def close(self):
        """Close the database connection."""
        self.dbconn.close()
开发者ID:muye5,项目名称:muye5code,代码行数:29,代码来源:URLFilter.py


示例17: __init__

 def __init__(self, roots,
              exclude=None, strict=True,  # What to crawl.
              max_redirect=10, max_tries=4,  # Per-url limits.
              max_tasks=10, *, loop=None):
     """Store the crawl configuration, derive root domains and queue the roots.

     Args:
         roots: Iterable of root URLs; each one is passed to ``add_url``.
         exclude: Stored as ``self.exclude``.
         strict: When true the lower-cased host is used as root domain,
             otherwise the result of ``lenient_host``.
         max_redirect: Stored as ``self.max_redirect``.
         max_tries: Stored as ``self.max_tries``.
         max_tasks: Stored as ``self.max_tasks``.
         loop: Event loop to use; defaults to ``asyncio.get_event_loop()``.
     """
     self.loop = loop or asyncio.get_event_loop()
     self.roots = roots
     self.exclude = exclude
     self.strict = strict
     self.max_redirect = max_redirect
     self.max_tries = max_tries
     self.max_tasks = max_tasks
     self.q = Queue(loop=self.loop)
     self.seen_urls = BloomFilter(10000000, 0.01)
     self.done = []
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.root_domains = set()
     for root in roots:
         netloc = urllib.parse.urlparse(root).netloc
         host, _port = urllib.parse.splitport(netloc)
         if not host:
             continue
         if re.match(r'\A[\d\.]*\Z', host):
             # Hosts made only of digits and dots are kept verbatim.
             domain = host
         elif self.strict:
             domain = host.lower()
         else:
             domain = lenient_host(host.lower())
         self.root_domains.add(domain)
     # Roots are queued only after every root domain is known.
     for root in roots:
         self.add_url(root)
     self.t0 = time.time()
     self.t1 = None
开发者ID:ramsayleung,项目名称:betacat,代码行数:33,代码来源:crawling.py


示例18: vote

def vote(request, poll):
    """Register the vote submitted in ``request`` for ``poll``.

    Returns the rendered ``detail.html`` page with an error message when no
    valid choice was submitted, otherwise None. Nothing is recorded when
    the poll has expired or the requester has already voted.
    """
    try:
        selected_choice = poll.choice_set.get(choice=request.POST['choice'])
    except (KeyError, Choice.DoesNotExist):
        return render_to_response('detail.html', {'poll':poll, 'error_message':"You didn't select a choice."},
                context_instance= RequestContext(request))

    if poll.has_expired() or already_voted(request, poll):
        return None

    voter_hash = request_hash(request)
    poll.total_votes += 1
    selected_choice.votes += 1
    poll.vote_set.create(hash=voter_hash)
    selected_choice.save()

    # Update the seen ips
    from pybloomfilter import BloomFilter
    ip_filter = BloomFilter.from_base64('/tmp/bloom.filter', poll.ips_seen)
    # ``add`` returns a true value when the address was already present.
    if not ip_filter.add(request.META['REMOTE_ADDR']):
        poll.ips_seen = ip_filter.to_base64()
        poll.ips_count += 1

    poll.save()

    return None
开发者ID:sbadame,项目名称:polling,代码行数:27,代码来源:views.py


示例19: count_matches

def count_matches(fastq_file, bf_files, sampling):
    """Count how many sampled reads of a fastq file occur in bloom filters.

    Every ``sampling``-th read of ``fastq_file`` is looked up in each bloom
    filter.

    Args:
        fastq_file: Path of the fastq file to read.
        bf_files: Path of one bloom filter file, or a list of such paths.
        sampling: Sampling interval; converted with ``int``. A value of 1
            checks every read.

    Returns:
        A tuple ``(checked, observed)`` where ``checked`` is the number of
        reads looked up and ``observed`` maps each bloom filter path to the
        number of checked reads found in that filter.
    """
    if isinstance(bf_files, basestring):
        bf_files = [bf_files]

    bf = {}
    observed = {}
    for bf_file in bf_files:
        bf[bf_file] = BloomFilter.open(bf_file)
        observed[bf_file] = 0

    checked = 0
    sampling = int(sampling)
    # The context manager closes the handle even if iteration fails.
    with open(fastq_file) as fastq_handle:
        for i, (_, read, _) in enumerate(FastqGeneralIterator(fastq_handle)):
            # The original ``not i + 1 % sampling`` parsed as
            # ``not (i + (1 % sampling))``, so for sampling > 1 no read was
            # ever skipped. Keep only every ``sampling``-th read.
            if (i + 1) % sampling:
                continue

            print(read)

            checked += 1
            for bf_file in bf_files:
                if read in bf[bf_file]:
                    observed[bf_file] += 1

    return checked, observed
开发者ID:vals,项目名称:Boutonniere,代码行数:33,代码来源:boutonniere.py


示例20: MongoDBPipeline

class MongoDBPipeline(object):
    """Item pipeline that stores new items in MongoDB and the search index.

    Items whose ``link`` was already seen, or that contain an empty field,
    are dropped.
    """

    def __init__(self):
        """Connect to MongoDB and create the bloom filter and search index."""
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Store ``item`` and return it.

        Raises:
            DropItem: If a field of the item is empty, or if the item's
                ``link`` was already present in the bloom filter.
        """
        # Validate the field values. The original loop tested the iterated
        # names themselves, so the check never fired. Validation also runs
        # before the bloom filter is touched, so an incomplete item is not
        # remembered as seen.
        # NOTE(review): assumes iterating ``item`` yields its field names,
        # as for a dict - confirm for the item type in use.
        for field in item:
            if not item[field]:
                raise DropItem("Missing data!")

        # ``add`` returns a true value when the link was already present.
        if self.bf.add(item['link']):
            raise DropItem("Duplicate item found: %s" % item)

        self.collection.update({'link': item['link']}, dict(item), upsert=True)
        log.msg("Question added to MongoDB database!",level=log.DEBUG, spider=spider)
        self.si.AddIndex(item)
        return item

    def __del__(self):
        """Finish the search index."""
        self.si.IndexDone()
开发者ID:pianer,项目名称:SearchLaw,代码行数:27,代码来源:pipelines.py



注:本文中的pybloomfilter.BloomFilter类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python pdf.BoletoPDF类代码示例发布时间:2022-05-25
下一篇:
Python pybloom.ScalableBloomFilter类代码示例发布时间:2022-05-25
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap