• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python pybloom.ScalableBloomFilter类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Python 中 pybloom.ScalableBloomFilter 的典型用法代码示例。如果您正苦于以下问题:Python ScalableBloomFilter 类的具体用法?Python ScalableBloomFilter 怎么用?Python ScalableBloomFilter 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了 ScalableBloomFilter 类的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。

示例1: vacuum_all

	def vacuum_all(self, limit=None):
		"""Delete version rows whose supplier catalog id is not in the known-id filter.

		All SupplierCatalogModel ids are first loaded into a
		ScalableBloomFilter.  Then, for every loaded plugin, the rows of the
		plugin's version model are scanned and each row whose
		``supplier_catalog_id`` is not reported by the filter is deleted
		through the session.

		:param limit: when truthy, only this many rows per plugin are
			scanned, ordered by ``vacuumed`` with NULLs first.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()
		self.session.begin(subtransactions=True)
		ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))

		# NOTE(review): presumably a probabilistic filter (pybloom); a false
		# positive would leave an orphaned row undeleted - confirm acceptable.
		known_catalog_ids = ScalableBloomFilter()
		id_rows = self.session.query(SupplierCatalogModel.id)
		for (catalog_id, ) in id_rows.yield_per(100):
			known_catalog_ids.add(catalog_id)

		for plug in self.plugins.itervalues():
			# The original stored this value but never read it; the call is kept.
			supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
			model_name = plug.version_model() + 'Model'
			version_cls = getattr(model, model_name)
			versions = self.session.query(version_cls)
			if limit:
				versions = versions.order_by(version_cls.vacuumed.nullsfirst()).limit(limit)

			ts['sub_done'] = 0
			ts['sub_total'] = versions.count()
			for version in versions.yield_per(10):
				orphaned = version.supplier_catalog_id not in known_catalog_ids
				if orphaned:
					logger.debug("Deleting %s %s", model_name, version.id)
					self.session.delete(version)
				ts['sub_done'] += 1
			ts['done'] += 1

		self.session.commit()
		ts.finish()
		logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:35,代码来源:supplier_catalog_item_version_task.py


示例2: URLFilter

class URLFilter(object):
    """Decide whether a crawled URL should be processed.

    A URL passes when it has not been seen before, contains none of the
    forbidden keywords, and consists solely of ASCII characters.
    """

    # Class-level lock: shared by every URLFilter instance and held for the
    # whole of pass_check().
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return False if ``url`` contains a forbidden keyword, True otherwise.

        Note the polarity: True means the URL is acceptable.
        """
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        """Return True if ``url`` contains only ASCII characters.

        Accepts both byte strings and text strings.  The previous
        implementation always called ``url.decode('ascii')``, which raised an
        uncaught UnicodeEncodeError for non-ASCII text on Python 2 and an
        AttributeError for any text string on Python 3.
        """
        try:
            if isinstance(url, bytes):
                url.decode('ascii')
            else:
                url.encode('ascii')
        except UnicodeError:
            # UnicodeError covers both UnicodeDecodeError and UnicodeEncodeError.
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        return True

    def pass_check(self, url):
        """Return True if ``url`` is new and passes the keyword and ASCII checks.

        The URL is recorded as seen before the other checks run, so a URL
        rejected once is reported as already seen on any later call.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
开发者ID:heroxdream,项目名称:information-retrieval,代码行数:33,代码来源:URLFilter.py


示例3: FilterHandler

class FilterHandler(object):
  """URL de-duplicator whose filter is persisted in ``data/bloom.data``."""

  def __init__(self, logger):
    """Store ``logger`` and load (or create) the de-duplication filter."""
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    """Add ``url`` to the filter; return True if add() reports a duplicate."""
    # Relies on add() returning a truthy value for an already-present key.
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    """Load the filter from ``data/bloom.data`` or create a fresh one."""
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, create one instead.')
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      # The dump is binary data: read it in binary mode so the bytes are
      # neither newline-translated nor decoded as text.
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    """Write the filter to ``data/bloom.data``, creating ``data`` if needed."""
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    # Binary mode for the same reason as in _load_from_file().
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    """Persist the filter to disk."""
    self._dump_to_file()
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:34,代码来源:url_filter_service.py


示例4: _load_from_file

 def _load_from_file(self):
   """Load the de-duplication filter from ``data/bloom.data``.

   When the cache file does not exist a new ScalableBloomFilter is created
   instead.  The result is stored on ``self.deduper_``.
   """
   self.logger_.info('loading data from cache file...')
   if not os.path.isfile('data/bloom.data'):
     self.logger_.error('bloom cache file not found, create one instead.')
     self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
   else:
     # The dump is binary data: open in binary mode so the bytes are neither
     # newline-translated nor decoded as text.
     with open('data/bloom.data', 'rb') as f:
       self.deduper_ = ScalableBloomFilter.fromfile(f)
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:8,代码来源:url_filter_service.py


示例5: __init__

  def __init__(self,filterfile):
    """Load the Bloom filter stored at ``filterfile`` or start a new one.

    :param filterfile: path of the serialized filter; remembered on
        ``self.filterfile``.
    """
    self.filterfile = filterfile
    # If filterfile is present load the Bloom filter from it, else create a new one.
    if os.path.exists(filterfile):
      # Use a context manager so the file handle is closed after loading
      # (the handle was previously left open).
      with open(filterfile, "rb") as fh:
        self.bf = ScalableBloomFilter.fromfile(fh)
      # Parenthesized single-argument form prints the same on Python 2 and 3.
      print("available signatures = %d" % len(self.bf))
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
开发者ID:FireAVR,项目名称:BloomAutoYara,代码行数:8,代码来源:BloomAutoYara.py


示例6: WishPipeline

class WishPipeline(object):
    """Item pipeline that drops items whose URL was already processed."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.LARGE_SET_GROWTH
        self.urls = ScalableBloomFilter(mode=growth_mode)

    def process_item(self, item, spider):
        """Return ``item`` if its URL is new, otherwise raise DropItem.

        DropItem is also raised when ``item`` or ``item['url']`` is None.
        ``spider`` is not used.
        """
        rejected = item is None or item['url'] is None or item['url'] in self.urls
        if rejected:
            raise DropItem("Duplicate item found.")
        self.urls.add(item['url'])
        return item
开发者ID:yangxue088,项目名称:wish,代码行数:10,代码来源:pipelines.py


示例7: test_bloom_string

    def test_bloom_string(self):
        """Insert 10000 random 40-letter strings and probe membership.

        The last inserted string must be reported as present; every single
        letter must be reported as absent.
        """
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for _ in xrange(0, 10000):
            letters = (random.choice(string.letters) for _ in xrange(40))
            rnd = ''.join(letters)
            bloom.add(rnd)

        self.assertTrue(rnd in bloom)

        # NOTE(review): a Bloom-filter false positive here would fail the test.
        for letter in string.letters:
            self.assertFalse(letter in bloom)

        self.assertTrue(rnd in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:13,代码来源:test_pybloom.py


示例8: to_bloomfilter

def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter containing every element of ``iterable``.

    :rtype : pybloom.ScalableBloomFilter
    :param iterable: elements to insert, consumed once
    :param init_cap: first positional argument of ScalableBloomFilter
    :param err_rate: second positional argument of ScalableBloomFilter
    """
    result = ScalableBloomFilter(init_cap, err_rate)
    insert = result.add
    for item in iterable:
        insert(item)
    return result
开发者ID:Faiz7412,项目名称:itpy,代码行数:15,代码来源:sketch.py


示例9: test_bloom_int

    def test_bloom_int(self):
        """Insert the integers 0..9999 and probe members and non-members."""
        total = 10000
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for value in xrange(0, total):
            bloom.add(value)

        # Every inserted value must be found.
        for value in xrange(0, total):
            self.assertTrue(value in bloom)

        # Random members drawn from the inserted range.
        for _ in xrange(0, total / 2):
            member = random.randint(0, total - 1)
            self.assertTrue(member in bloom)

        # Random values from [10000, 20000], none of which was inserted.
        # NOTE(review): a Bloom-filter false positive here would fail the test.
        for _ in xrange(0, total / 2):
            stranger = random.randint(total, total * 2)
            self.assertFalse(stranger in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:16,代码来源:test_pybloom.py


示例10: RequestFilter

class RequestFilter(object):

    """Remembers request fingerprints in a scalable Bloom filter."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
        self.sbf = ScalableBloomFilter(mode=growth_mode)

    def request_seen(self, request):
        """Return True if the request's fingerprint is already recorded.

        An unseen fingerprint is recorded and False is returned.
        """
        fingerprint = request_fingerprint(request)
        already_seen = fingerprint in self.sbf
        if not already_seen:
            self.sbf.add(fingerprint)
        return already_seen
开发者ID:kaito-kidd,项目名称:mini-scrapy,代码行数:16,代码来源:scheduler.py


示例11: get_category_conversion

	def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
		"""Return the category conversion for the given key, creating it if absent.

		On first use, every (supplier_id, manufacturer_id, needle) row is
		loaded into ``self.category_conversion_filter``.  The database is only
		queried when the filter reports the key; when no row is found a new
		CategoryConversionModel is added to the session and returned.
		"""
		if self.category_conversion_filter is None:
			self.category_conversion_filter = ScalableBloomFilter()
			known_rows = self.session.query(
				CategoryConversionModel.supplier_id,
				CategoryConversionModel.manufacturer_id,
				CategoryConversionModel.needle
			)
			for known_row in known_rows.yield_per(100):
				self.category_conversion_filter.add(known_row)

		key = (supplier_id, manufacturer_id, category_identifier)
		if key in self.category_conversion_filter:
			lookup = (
				self.session.query(CategoryConversionModel)
				.filter(CategoryConversionModel.supplier_id == supplier_id)
				.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
				.filter(CategoryConversionModel.needle == category_identifier)
			)
			try:
				return lookup.one()
			except NoResultFound:
				# The filter reported a key with no matching row; create it below.
				pass

		created = CategoryConversionModel()
		created.manufacturer_id = manufacturer_id
		created.supplier_id = supplier_id
		created.needle = category_identifier
		self.session.add(created)
		self.category_conversion_filter.add(key)
		return created
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:31,代码来源:supplier_catalog_item_task.py


示例12: __init__

 def __init__(self, initial_capacity=1000, error_rate=0.0001):
     """Create the backing Bloom filter and a random two-byte salt."""
     growth_mode = ScalableBloomFilter.LARGE_SET_GROWTH
     self._set = ScalableBloomFilter(
         initial_capacity=initial_capacity,
         error_rate=error_rate,
         mode=growth_mode,
     )
     # A false positive in the Bloom filter makes us fail to garbage-collect
     # an object.  A fresh salt per run ensures the set of false positives
     # differs from one run to the next.
     self._bloom_salt = os.urandom(2)
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:8,代码来源:util.py


示例13: __init__

 def __init__(self, source_image):
     """Record the existing (tile pk, stock_tile_match) pairs of ``source_image``.

     The filter's initial capacity is the image's tile count.
     """
     self.source_image = source_image
     tile_count = source_image.tiles.count()
     self.bloom_filter = ScalableBloomFilter(
         initial_capacity=tile_count,
         error_rate=0.0001,  # 1 in 10,000
     )
     pairs = source_image.tiles.values_list('pk', 'stock_tile_match')
     for tile_pk, match_pk in pairs:
         self.bloom_filter.add((tile_pk, match_pk))
开发者ID:pipermerriam,项目名称:mozy,代码行数:9,代码来源:exclusions.py


示例14: __init__

 def __init__(self, spider):
     """Set up the frontier state for ``spider``.

     Creates the Redis connection, the URL filter and the two names derived
     from ``spider.name``, then calls ``_feedfilter()``.
     """
     super(BFSFrontier, self).__init__(spider)
     self._spider = spider
     self.args = {
         'rules': [],
         'order': 'bfs',
     }
     self.redis = RediSugar.getConnection()
     growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
     self.filter = ScalableBloomFilter(mode=growth_mode)
     self.todo = spider.name + '-todo'
     self.visited = spider.name + '-visited'
     self._feedfilter()
开发者ID:ymero,项目名称:PyCrawler,代码行数:10,代码来源:frontier.py


示例15: count_distinct_approx

def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate the number of distinct elements in ``iterable``.

    Each element is checked against a ScalableBloomFilter; elements the filter
    does not report are added to it and counted.

    :param iterable: elements to count, consumed once
    :param init_cap: first positional argument of ScalableBloomFilter
    :param err_rate: second positional argument of ScalableBloomFilter
    """
    distinct = 0
    seen = ScalableBloomFilter(init_cap, err_rate)

    for item in iterable:
        if item in seen:
            continue
        seen.add(item)
        distinct += 1

    return distinct
开发者ID:Faiz7412,项目名称:itpy,代码行数:20,代码来源:sketch.py


示例16: BloomSet

class BloomSet(object):
    """Set-like wrapper around a salted ScalableBloomFilter.

    Supports ``add(name)`` and ``name in bloom_set``.  Membership tests may
    return false positives.
    """

    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                error_rate=error_rate,
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object.  Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        """Record ``name`` (text or byte string) in the set."""
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        """Return the salted byte-string key under which ``name`` is stored.

        Text strings are encoded as UTF-8 first; byte strings are used as is.
        """
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
        # check no longer depends on the Python-2-only name ``unicode``.
        if isinstance(name, type(u'')):
            name = name.encode('utf-8')
        return self._bloom_salt + name
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:21,代码来源:util.py


示例17: main

def main(args):
    """Read fetched-URL JSON records from stdin and report each new URL once.

    Each non-blank input line must be a JSON object with ``url`` and
    ``effective_url`` keys.  A record is skipped when either URL has already
    been seen; otherwise both are recorded and a ``[postproc]`` line is
    printed.  ``args`` is not used.
    """
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        # Lines read from stdin keep their trailing newline, so a blank line
        # is "\n" (truthy).  Strip before testing, otherwise json.loads()
        # raises on blank lines.
        if not ln.strip():
            continue
        fetchedUrl = json.loads(ln)

        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue

        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])

        # extract links and filter out some urls by url filter.
        # NOTE(review): the result is not used yet; the call is kept in case
        # the helpers have side effects.
        outlinks = url_filter(extract_links(fetchedUrl))

        # analyze

        # Parenthesized single-argument form prints the same on Python 2 and 3.
        print("[postproc]%s" % fetchedUrl["url"])
开发者ID:etman,项目名称:xPyCrawler,代码行数:21,代码来源:postproc.py


示例18: vacuum_all

	def vacuum_all(self, limit=None):
		"""Delete version rows whose supplier special id is not in the known-id filter.

		All SupplierSpecialModel ids are loaded into a ScalableBloomFilter.
		For every loaded plugin, the rows of the plugin's version model are
		scanned and each row whose ``supplier_special_id`` is not reported by
		the filter is deleted via DBSession.

		:param limit: when truthy, only this many rows per plugin are
			scanned, ordered by ``vacuumed`` with NULLs first.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()
		ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
		tx = transaction.get()
		
		try:
			#s = set()
			# NOTE(review): presumably a probabilistic filter (pybloom); a false
			# positive would leave an orphaned row undeleted - confirm acceptable.
			s = ScalableBloomFilter()
			query = DBSession.query(SupplierSpecialModel.id)
			for (supplier_special_id, ) in query.yield_per(100):
				s.add(supplier_special_id)
			
			for plug in self.plugins.itervalues():
				# NOTE(review): this value is never read afterwards.
				supplier_special_filter_id = plug.supplier_special_filter_id()
				model_name = plug.version_model()  + 'Model'
				VersionModel = getattr(model, model_name)
				query = DBSession.query(VersionModel)
				if limit:
					query = query.order_by(VersionModel.vacuumed.nullsfirst())
					query = query.limit(limit)

				ts['sub_done'] = 0
				ts['sub_total'] = query.count()
				for supplier_special_item_version in query.yield_per(10):
					if supplier_special_item_version.supplier_special_id not in s:
						logger.debug("Deleting %s %s", model_name, supplier_special_item_version.id)
						DBSession.delete(supplier_special_item_version)
					ts['sub_done'] += 1
					# Flush every 1000 scanned rows.
					if ts['sub_done'] % 1000 == 0:
						DBSession.flush()
				# Flush whatever remains for this plugin.
				DBSession.flush()
				ts['done'] += 1
		except Exception:
			# The exception is logged and swallowed, not re-raised.
			logger.exception('Caught Exception: ')
			tx.abort()
		finally:
			ts.finish()
		# NOTE(review): this commit also runs after the abort path above -
		# confirm that committing after tx.abort() is intended.
		transaction.commit()
		logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:40,代码来源:supplier_special_item_version_task.py


示例19: get_scale_conversion

	def get_scale_conversion(self, supplier_id, scale_identifier):
		"""Return a ScaleConversionModel for (supplier_id, scale_identifier).

		Returns None when either argument is None.  An existing conversion is
		looked up only when ``self.scale_conversion_filter`` reports the key.
		Otherwise a new conversion is built: if a ScaleModel named
		``scale_identifier`` exists, the new object carries only its id;
		if not, a conversion with ``scale_id`` None is added to the session,
		recorded in the filter and flushed.
		"""
		if scale_identifier is None or supplier_id is None:
			return None

		if self.scale_conversion_filter is None:
			# First use: load every known (supplier_id, scale_identifier) row.
			self.scale_conversion_filter = ScalableBloomFilter()
			known_rows = self.session.query(
				ScaleConversionModel.supplier_id,
				ScaleConversionModel.scale_identifier
			)
			for known_row in known_rows.yield_per(100):
				self.scale_conversion_filter.add(known_row)

		key = (supplier_id, scale_identifier)
		if key in self.scale_conversion_filter:
			lookup = (
				self.session.query(ScaleConversionModel)
				.filter(ScaleConversionModel.supplier_id == supplier_id)
				.filter(ScaleConversionModel.scale_identifier == scale_identifier)
			)
			try:
				return lookup.one()
			except NoResultFound:
				# The filter reported a key with no matching row; fall through.
				pass

		scale_lookup = self.session.query(ScaleModel).filter(ScaleModel.name == scale_identifier)
		try:
			scale = scale_lookup.one()
		except NoResultFound:
			scale = None

		scale_conversion = ScaleConversionModel()
		if scale is not None:
			# NOTE(review): this object gets no supplier_id / scale_identifier
			# and is not added to the session or the filter - confirm intended.
			scale_conversion.scale_id = scale.id
			return scale_conversion

		scale_conversion.scale_id = None
		scale_conversion.supplier_id = supplier_id
		scale_conversion.scale_identifier = scale_identifier
		self.session.add(scale_conversion)
		self.scale_conversion_filter.add(key)
		self.session.flush()
		return scale_conversion
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:52,代码来源:supplier_catalog_item_task.py


示例20: get_price_control

	def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special):
		"""Return the enabled PriceControlModel matching the arguments, or None.

		On first use, every (supplier_id, manufacturer_id) pair is loaded into
		``self.price_control_filter``; the database is only queried when the
		filter reports the pair.  The query requires ``retail`` to lie within
		[retail_low, retail_high] and selects on the preorder / special /
		normal flags.  None is returned when no row matches, and also (after a
		warning) when several rows match.
		"""
		if self.price_control_filter is None:
			self.price_control_filter = ScalableBloomFilter()
			known_pairs = self.session.query(
				PriceControlModel.supplier_id,
				PriceControlModel.manufacturer_id
			)
			for known_pair in known_pairs.yield_per(100):
				self.price_control_filter.add(known_pair)

		if (supplier_id, manufacturer_id) not in self.price_control_filter:
			return None

		# The ``== True`` comparisons build SQL expressions and must stay as is.
		candidates = self.session.query(PriceControlModel)
		candidates = candidates.filter(PriceControlModel.supplier_id == supplier_id)
		candidates = candidates.filter(PriceControlModel.manufacturer_id == manufacturer_id)
		if preorder:
			candidates = candidates.filter(PriceControlModel.preorder == True)
		if special:
			candidates = candidates.filter(PriceControlModel.special == True)
		if not (preorder or special):
			candidates = candidates.filter(PriceControlModel.normal == True)
		candidates = candidates.filter(PriceControlModel.retail_low <= retail)
		candidates = candidates.filter(PriceControlModel.retail_high >= retail)
		candidates = candidates.filter(PriceControlModel.enable == True)

		try:
			return candidates.one()
		except NoResultFound:
			# A missing price control is deliberately not logged.
			return None
		except MultipleResultsFound:
			logger.warning(
				"Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", 
				supplier_id,
				manufacturer_id,
				retail,
				preorder,
				special
			)
			return None
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:51,代码来源:supplier_catalog_item_task.py



注:本文中的pybloom.ScalableBloomFilter类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python pybloomfilter.BloomFilter类代码示例发布时间:2022-05-25
下一篇:
Python pybloom.BloomFilter类代码示例发布时间:2022-05-25
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap