本文整理汇总了Python中pyspider.libs.utils.md5string函数的典型用法代码示例。如果您正苦于以下问题：Python md5string函数的具体用法？Python md5string怎么用？Python md5string使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了md5string函数的19个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: crawl_list_page
def crawl_list_page(self, response):
    """Follow the links of a list page, or refresh cookies on a captcha page.

    When ``self.check_captcha(response)`` is true and the stored cookie is
    not newer than the response cookie (compared by their ``ctime`` values),
    a new cookie is requested through ``self.verify_vcode``. Otherwise every
    anchor matched by ``self.LIST_ANCHOR_SEL`` is scheduled for
    ``self.detail_page`` with a task id derived from the anchor text.

    Raises:
        Exception: if ``self.verify_vcode`` returns a falsy value.
    """
    # NOTE(review): this snippet lost its indentation; the ``else`` below is
    # assumed to pair with the captcha check -- confirm against the original.
    db_cookie = self.get_cookie() or {}
    r_cookie = response.cookies
    # Debug output. The call form prints a single value identically on
    # Python 2 and Python 3, unlike the Python-2-only print statement.
    print(db_cookie)
    print(r_cookie)
    db_ctime = int(db_cookie.get('ctime', 0))
    r_ctime = int(r_cookie.get('ctime', 0))
    if self.check_captcha(response):
        if db_ctime <= r_ctime:
            db_cookie = self.verify_vcode(response)
            if not db_cookie:
                raise Exception('sougou_weixin refresh cookies fail!')
        #self.crawl(response.url, callback=self.crawl_list_page, cookies=db_cookie, save=response.save, force_update=True)
    else:
        # response.cookies.update(cookies)
        # Updating the cookies here would break the redirect to the detail page.
        for each in response.doc(self.LIST_ANCHOR_SEL).items():
            taskid = md5string(each.text())
            self.crawl(each.attr.href, taskid=taskid, callback=self.detail_page,
                       save=response.save, cookies=response.cookies)
开发者ID:ThomasLsm,项目名称:PythonCodes,代码行数:25,代码来源:weixin.py
示例2: _update_project
def _update_project(self, project):
    """Refresh the cached record and the task queue of one project.

    The project dict is merged into ``self.projects``, the md5 of its script
    is stored under ``"md5sum"`` and an ``"active_tasks"`` deque is created
    when missing. A RUNNING/DEBUG project gets its queue loaded and its
    rate/burst applied, then an ``_on_get_info`` task is selected; for any
    other status the queue is throttled to zero and removed.
    """
    name = project["name"]
    if name not in self.projects:
        self.projects[name] = {}
    record = self.projects[name]
    record.update(project)
    record["md5sum"] = utils.md5string(project["script"])
    if not record.get("active_tasks", None):
        record["active_tasks"] = deque(maxlen=self.ACTIVE_TASKS)

    if project["status"] not in ("RUNNING", "DEBUG"):
        # Project is stopped: zero the limits and drop its task queue.
        if name in self.task_queue:
            self.task_queue[name].rate = 0
            self.task_queue[name].burst = 0
            del self.task_queue[name]
        return

    # Project is running: make sure its task queue exists and is configured.
    if name not in self.task_queue:
        self._load_tasks(name)
    self.task_queue[name].rate = project["rate"]
    self.task_queue[name].burst = project["burst"]

    # Ask the processor for the project's runtime info by sending an
    # _on_get_info request; the result ends up in status_page.track.save.
    info_request = {
        "taskid": "_on_get_info",
        "project": name,
        "url": "data:,_on_get_info",
        "status": self.taskdb.SUCCESS,
        "fetch": {"save": ["min_tick"]},
        "process": {"callback": "_on_get_info"},
    }
    self.on_select_task(info_request)
开发者ID:railroadman,项目名称:pyspider,代码行数:33,代码来源:scheduler.py
示例3: _update_project
def _update_project(self, project):
    '''Bring the scheduler's view of one project in line with ``project``.

    Merges the project dict into ``self.projects`` (adding the script md5 as
    ``md5sum`` and an ``active_tasks`` deque if absent), then either sets up
    the task queue for a RUNNING/DEBUG project and selects an
    ``_on_get_info`` task, or throttles and deletes the queue otherwise.
    '''
    project_name = project['name']
    if project_name not in self.projects:
        self.projects[project_name] = {}
    self.projects[project_name].update(project)
    self.projects[project_name]['md5sum'] = utils.md5string(project['script'])
    has_active_tasks = self.projects[project_name].get('active_tasks', None)
    if not has_active_tasks:
        self.projects[project_name]['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

    # Load the task queue while the project runs; delete it once it is stopped.
    is_running = project['status'] in ('RUNNING', 'DEBUG')
    if is_running:
        if project_name not in self.task_queue:
            self._load_tasks(project_name)
        self.task_queue[project_name].rate = project['rate']
        self.task_queue[project_name].burst = project['burst']

        # Update the project's runtime info from the processor by sending an
        # _on_get_info request; the result is in status_page.track.save.
        fetch_options = {'save': ['min_tick', ]}
        process_options = {'callback': '_on_get_info'}
        self.on_select_task({
            'taskid': '_on_get_info',
            'project': project_name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': fetch_options,
            'process': process_options,
        })
    elif project_name in self.task_queue:
        self.task_queue[project_name].rate = 0
        self.task_queue[project_name].burst = 0
        del self.task_queue[project_name]
开发者ID:Cloudebug,项目名称:pyspider,代码行数:35,代码来源:scheduler.py
示例4: _crawl
def _crawl(self, url, **kwargs):
    """Build a task dict for ``url`` from the crawl options and queue it.

    Defaults are filled in from the callback's ``_config`` and from
    ``self.crawl_config`` (explicit keyword arguments win). The options are
    then split into the ``schedule``, ``fetch`` and ``process`` sections of
    the task. The task is appended to ``self._follows`` and returned.

    A caller-supplied ``taskid`` is honoured; otherwise the id is the md5 of
    the final URL.

    Raises:
        NotImplementedError: if ``callback`` is neither the name of an
            attribute of ``self`` nor a method bound to ``self``.
        AssertionError: if ``files`` is given and ``data`` is not a dict.
    """
    def _collect(keys):
        # Copy the listed options that were supplied with a non-None value.
        return dict((key, kwargs[key]) for key in keys
                    if key in kwargs and kwargs[key] is not None)

    task = {}

    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, basestring) and hasattr(self, callback):
            func = getattr(self, callback)
        elif hasattr(callback, 'im_self') and callback.im_self is self:
            func = callback
            # Tasks carry the callback by name, not as a function object.
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            for k, v in func._config.iteritems():
                kwargs.setdefault(k, v)

    if hasattr(self, 'crawl_config'):
        for k, v in self.crawl_config.iteritems():
            kwargs.setdefault(k, v)

    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
    if kwargs.get('files'):
        assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                        kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    # Checked again on purpose: the encoded payload may have become empty.
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    schedule = _collect(('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'))
    if schedule:
        task['schedule'] = schedule

    # NOTE(review): 'last_modifed' is spelled this way in the original key
    # list; presumably the consumer of the task expects the same spelling, so
    # it is left untouched -- confirm before renaming.
    fetch = _collect(('method', 'headers', 'data', 'timeout', 'allow_redirects',
                      'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                      'js_run_at', 'js_script', 'load_images', 'fetch_type'))
    if fetch:
        task['fetch'] = fetch

    process = _collect(('callback', ))
    if process:
        task['process'] = process

    task['project'] = self.project_name
    task['url'] = url
    # The id used to be read from ``task``, which never contains one at this
    # point, so a ``taskid=`` passed by the caller was silently dropped.
    task['taskid'] = kwargs.get('taskid') or md5string(url)

    self._follows.append(task)
    return task
开发者ID:Keary,项目名称:pyspider,代码行数:60,代码来源:base_handler.py
示例5: test_put
def test_put(n):
    """Benchmark putting ``n`` tasks on the message queue and log the timing.

    Uses ``task``, ``queue``, ``logger`` and ``md5string`` from the
    surrounding scope; ``task`` is mutated in place for every put.
    """
    logger.info("message queue put %d", n)
    start_time = time.time()
    for i in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % i
        task['taskid'] = md5string(task['url'])
        queue.put(task, block=True, timeout=1)
    end_time = time.time()
    cost_time = end_time - start_time
    # Both ratios are undefined when n == 0 or when a coarse clock reports no
    # elapsed time; report 0 instead of raising ZeroDivisionError.
    per_second = n * 1.0 / cost_time if cost_time else 0.0
    per_task_ms = cost_time / n * 1000 if n else 0.0
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, per_second, per_task_ms)
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:11,代码来源:bench.py
示例6: test_update
def test_update(n, start=0):
    """Benchmark ``n`` taskdb updates, with URL counters beginning at ``start``.

    Uses ``task``, ``track``, ``taskdb``, ``logger`` and ``md5string`` from
    the surrounding scope; ``task`` is mutated in place for every update.
    """
    # Lazy %-arguments, consistent with the other log calls of this benchmark.
    logger.info("taskdb update %d", n)
    start_time = time.time()
    for i in range(n):
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.update(task['project'], task['taskid'], task)
    end_time = time.time()
    cost_time = end_time - start_time
    # Both ratios are undefined when n == 0 or when a coarse clock reports no
    # elapsed time; report 0 instead of raising ZeroDivisionError.
    per_second = n * 1.0 / cost_time if cost_time else 0.0
    per_task_ms = cost_time / n * 1000 if n else 0.0
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, per_second, per_task_ms)
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:12,代码来源:bench.py
示例7: test_insert
def test_insert(n, start=0):
    """Benchmark ``n`` taskdb inserts, with URL counters beginning at ``start``.

    Uses ``task``, ``taskdb``, ``logger`` and ``md5string`` from the
    surrounding scope; ``task`` is mutated in place for every insert.
    """
    logger.info("taskdb insert %d", n)
    start_time = time.time()
    for i in range(n):
        task['url'] = 'http://bench.pyspider.org/?l={0:d}'.format(i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = {}
        taskdb.insert(task['project'], task['taskid'], task)
    end_time = time.time()
    cost_time = end_time - start_time
    # Both ratios are undefined when n == 0 or when a coarse clock reports no
    # elapsed time; report 0 instead of raising ZeroDivisionError.
    per_second = n * 1.0 / cost_time if cost_time else 0.0
    per_task_ms = cost_time / n * 1000 if n else 0.0
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, per_second, per_task_ms)
开发者ID:runt18,项目名称:pyspider,代码行数:12,代码来源:bench.py
示例8: test_get
def test_get(n, start=0, random=True, fields=request_task_fields):
    """Benchmark ``n`` taskdb lookups, optionally in shuffled order.

    Args:
        n: number of tasks to fetch.
        start: offset added to the URL counter.
        random: when true, the lookup order is shuffled.
        fields: passed through to ``taskdb.get_task``.

    Uses ``task``, ``track``, ``taskdb``, ``logger`` and ``md5string`` from
    the surrounding scope; ``task`` is mutated in place for every lookup.
    """
    # Lazy %-arguments, consistent with the other log calls of this benchmark.
    logger.info("taskdb get %d %s", n, "randomly" if random else "")
    range_n = list(range(n))
    if random:
        # Imported here because the ``random`` parameter shadows the module name.
        from random import shuffle
        shuffle(range_n)
    start_time = time.time()
    for i in range_n:
        task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
        task['taskid'] = md5string(task['url'])
        task['track'] = track
        taskdb.get_task(task['project'], task['taskid'], fields=fields)
    end_time = time.time()
    cost_time = end_time - start_time
    # Both ratios are undefined when n == 0 or when a coarse clock reports no
    # elapsed time; report 0 instead of raising ZeroDivisionError.
    per_second = n * 1.0 / cost_time if cost_time else 0.0
    per_task_ms = cost_time / n * 1000 if n else 0.0
    logger.info("cost %.2fs, %.2f/s %.2fms",
                cost_time, per_second, per_task_ms)
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:16,代码来源:bench.py
示例9: send_message
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send a message to a project from the command line.

    ``scheduler_rpc`` may be an address string (it is connected first) or
    ``None``. For ``None`` the address is taken from the
    ``SCHEDULER_PORT_23333_TCP`` environment variable when ``SCHEDULER_NAME``
    is set, and falls back to ``http://127.0.0.1:23333/``. Returns the result
    of ``scheduler_rpc.send_task``.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)

    if scheduler_rpc is None and os.environ.get("SCHEDULER_NAME"):
        # Strip the "tcp://" scheme from the environment-provided address.
        linked_address = os.environ["SCHEDULER_PORT_23333_TCP"][len("tcp://") :]
        scheduler_rpc = connect_rpc(ctx, None, "http://%s/" % (linked_address))

    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, "http://127.0.0.1:23333/")

    message_task = {
        "taskid": utils.md5string("data:,on_message"),
        "project": project,
        "url": "data:,on_message",
        "fetch": {"save": ("__command__", message)},
        "process": {"callback": "_on_message"},
    }
    return scheduler_rpc.send_task(message_task)
开发者ID:RacoonBattle,项目名称:pyspider,代码行数:20,代码来源:run.py
示例10: update
def update(self, project_info):
    '''Apply a fresh ``project_info`` dict to this project.

    Copies name, group, status and update time, recomputes the script md5
    and, for an active project whose script changed (or which is still
    waiting for info), flags that ``_on_get_info`` must be sent. The task
    queue's rate/burst follow ``project_info`` while active, else become 0.
    '''
    self.project_info = project_info

    self.name = project_info['name']
    self.group = project_info['group']
    self.db_status = project_info['status']
    self.updatetime = project_info['updatetime']

    script_md5 = utils.md5string(project_info['script'])
    needs_info = self.md5sum != script_md5 or self.waiting_get_info
    if needs_info and self.active:
        self._send_on_get_info = True
        self.waiting_get_info = True
    self.md5sum = script_md5

    if not self.active:
        # Inactive projects must not dispatch anything.
        self.task_queue.rate = 0
        self.task_queue.burst = 0
    else:
        self.task_queue.rate = project_info['rate']
        self.task_queue.burst = project_info['burst']
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:20,代码来源:scheduler.py
示例11: send_message
def send_message(ctx, scheduler_rpc, project, message):
    '''Deliver ``message`` to ``project`` through the scheduler RPC.

    ``scheduler_rpc`` may be an address string (it is connected first) or
    ``None``. For ``None`` the address comes from the
    ``SCHEDULER_PORT_23333_TCP`` environment variable when ``SCHEDULER_NAME``
    is set, and falls back to ``http://localhost:23333/``. Returns the result
    of ``scheduler_rpc.send_task``.
    '''
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)

    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Strip the 'tcp://' scheme from the environment-provided address.
        tcp_address = os.environ['SCHEDULER_PORT_23333_TCP']
        host_port = tcp_address[len('tcp://'):]
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % (host_port))

    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://localhost:23333/')

    payload = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(payload)
开发者ID:ConnorDFlynn,项目名称:Group1PySpider,代码行数:20,代码来源:run.py
示例12: _load_project
def _load_project(self, project):
    '''Build the module for ``project`` and register it in ``self.projects``.

    Stores the script md5 in ``project['md5sum']`` before building. If
    anything raises, the error is logged and a placeholder entry carrying
    the exception and its traceback is registered instead.

    Returns True when the project was built, False when loading failed.
    '''
    try:
        project['md5sum'] = utils.md5string(project['script'])
        built = self.build_module(project, self.env)
        self.projects[project['name']] = built
    except Exception as e:
        logger.exception("load project %s error", project.get('name', None))
        failed_entry = {
            'loader': None,
            'module': None,
            'class': None,
            'instance': None,
            'exception': e,
            'exception_log': traceback.format_exc(),
            'info': project,
            'load_time': time.time(),
        }
        self.projects[project['name']] = failed_entry
        return False
    else:
        logger.debug('project: %s updated.', project.get('name', None))
        return True
开发者ID:cash2one,项目名称:mytest,代码行数:22,代码来源:project_module.py
示例13: update
def update(self, project_info):
    '''Apply a fresh ``project_info`` dict to this project and log the result.

    Copies name, group, status and update time, recomputes the script md5
    and, for an active project whose script changed (or which is still
    waiting for info), flags that ``_on_get_info`` must be sent. The task
    queue's rate/burst follow ``project_info`` while active, else become 0.
    '''
    self.project_info = project_info

    self.name = project_info['name']
    self.group = project_info['group']
    self.db_status = project_info['status']
    self.updatetime = project_info['updatetime']

    new_md5sum = utils.md5string(project_info['script'])
    refresh_wanted = self.md5sum != new_md5sum or self.waiting_get_info
    if refresh_wanted and self.active:
        self._send_on_get_info = True
        self.waiting_get_info = True
    self.md5sum = new_md5sum

    if not self.active:
        # Inactive projects must not dispatch anything.
        self.task_queue.rate = 0
        self.task_queue.burst = 0
    else:
        self.task_queue.rate = project_info['rate']
        self.task_queue.burst = project_info['burst']

    queued = len(self.task_queue)
    logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                self.name, self.db_status, self.paused, queued)
开发者ID:trimpsyw,项目名称:pyspider,代码行数:23,代码来源:scheduler.py
示例14: get_taskid
def get_taskid(self, task):
    '''Return the id of ``task``: the md5 of its URL. Override to customise.'''
    task_url = task['url']
    return md5string(task_url)
开发者ID:M2shad0w,项目名称:pyspider,代码行数:3,代码来源:base_handler.py
示例15: get_taskid
def get_taskid(self, task):
    """Return the id of ``task``: the md5 of its URL. Override to customise."""
    task_url = task["url"]
    return md5string(task_url)
开发者ID:shylou,项目名称:pyspider,代码行数:3,代码来源:base_handler.py
示例16: on_task
def on_task(self, task, response):
    '''Run the project callback for one fetched task and publish the outcome.

    The response is rebuilt and handed to the project's instance. Unless the
    result's ``extinfo`` sets ``not_send_status``, a status pack goes to
    ``self.status_queue``. Every follow-up task goes to
    ``self.newtask_queue`` and every message becomes an ``_on_message`` task
    on ``self.inqueue``.

    Returns False when rebuilding or running raised (the exception is
    logged), True otherwise.
    '''
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        if project not in self.projects:
            raise LookupError("no such project: %s" % project)
        project_data = self.projects[project]
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        fetch_ok = response.isok()
        # Keep a content sample only when the fetch or the processing failed.
        if not response.isok() or ret.exception:
            content_sample = response.content[:500]
        else:
            content_sample = None
        fetch_track = {
            'ok': fetch_ok,
            'time': response.time,
            'status_code': response.status_code,
            'headers': dict(response.headers),
            'encoding': response.encoding,
            'content': content_sample,
        }
        process_track = {
            'ok': not ret.exception,
            'time': process_time,
            'follows': len(ret.follows),
            'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
            'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
            'exception': ret.exception,
        }
        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': fetch_track,
                'process': process_track,
            },
        }
        # FIXME: unicode_obj should be applied in the scheduler before storing
        # to the database; it is done here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    for followed in ret.follows:
        # FIXME: unicode_obj should be applied in the scheduler before storing
        # to the database; it is done here for performance.
        self.newtask_queue.put(utils.unicode_obj(followed))

    for target_project, msg, msg_url in ret.messages:
        message_task = {
            'taskid': utils.md5string(msg_url),
            'project': target_project,
            'url': msg_url,
            'process': {
                'callback': '_on_message',
            }
        }
        message_response = {
            'status_code': 200,
            'url': msg_url,
            'save': (task['project'], msg),
        }
        self.inqueue.put((message_task, message_response))

    logger_func = logger.error if (response.error or ret.exception) else logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'],
        task.get('url'), response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
开发者ID:7472741,项目名称:pyspider,代码行数:77,代码来源:processor.py
示例17: on_task
def on_task(self, task, response):
'''Deal one task'''
start_time = time.time()
response = rebuild_response(response)
try:
assert 'taskid' in task, 'need taskid in task'
project = task['project']
updatetime = task.get('project_updatetime', None)
md5sum = task.get('project_md5sum', None)
project_data = self.project_manager.get(project, updatetime, md5sum)
assert project_data, "no such project!"
if project_data.get('exception'):
ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
exception=project_data['exception'])
else:
ret = project_data['instance'].run_task(
project_data['module'], task, response)
except Exception as e:
logstr = traceback.format_exc()
ret = ProcessorResult(logs=(logstr, ), exception=e)
process_time = time.time() - start_time
if not ret.extinfo.get('not_send_status', False):
if ret.exception:
track_headers = dict(response.headers)
else:
track_headers = {}
for name in ('etag', 'last-modified'):
if name not in response.headers:
continue
track_headers[name] = response.headers[name]
status_pack = {
'taskid': task['taskid'],
'project': task['project'],
'url': task.get('url'),
'track': {
'fetch': {
'ok': response.isok(),
'redirect_url': response.url if response.url != response.orig_url else None,
'time': response.time,
'error': response.error,
'status_code': response.status_code,
'encoding': response.encoding,
'headers': track_headers,
'content': response.text[:500] if ret.exception else None,
},
'process': {
'ok': not ret.exception,
'time': process_time,
'follows': len(ret.follows),
'result': (
None if ret.result is None
else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
),
'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
'exception': ret.exception,
},
'save': ret.save,
},
}
if 'schedule' in task:
status_pack['schedule'] = task['schedule']
# FIXME: unicode_obj should used in scheduler before store to database
# it's used here for performance.
self.status_queue.put(utils.unicode_obj(status_pack))
# FIXME: unicode_obj should used in scheduler before store to database
# it's used here for performance.
# logger.info('process follows :%s' % ret.follows)
# logger.info('process messages :%s' % ret.messages)
if ret.follows:
for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])
for project, msg, url in ret.messages:
try:
self.on_task({
'taskid': utils.md5string(url),
'project': project,
'url': url,
'process': {
'callback': '_on_message',
}
}, {
'status_code': 200,
'url': url,
'save': (task['project'], msg),
})
except Exception as e:
logger.exception('Sending message error.')
continue
if ret.exception:
logger_func = logger.error
else:
logger_func = logger.info
#.........这里部分代码省略.........
开发者ID:cash2one,项目名称:mytest,代码行数:101,代码来源:processor.py
示例18: on_task
def on_task(self, task, response):
    '''Run the project callback for one fetched task and publish the outcome.

    The project is looked up through ``self.project_manager``. Unless the
    result's ``extinfo`` sets ``not_send_status``, a status pack goes to
    ``self.status_queue``. Follow-up tasks are sent as one list to
    ``self.newtask_queue`` and every message becomes an ``_on_message`` task
    on ``self.inqueue``.

    Returns False when the project is unknown or when rebuilding/running
    raised (both are logged), True otherwise.
    '''
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            # On failure keep every response header.
            track_headers = dict(response.headers)
        else:
            # On success keep only these two headers, when present.
            track_headers = dict(
                (name, response.headers[name])
                for name in ('etag', 'last-modified')
                if name in response.headers
            )

        if response.url != response.orig_url:
            redirect_url = response.url
        else:
            redirect_url = None
        fetch_track = {
            'ok': response.isok(),
            'redirect_url': redirect_url,
            'time': response.time,
            'error': response.error,
            'status_code': response.status_code,
            'encoding': response.encoding,
            'headers': track_headers,
            'content': response.content[:500] if ret.exception else None,
        }

        if ret.result is None:
            result_text = None
        else:
            result_text = utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
        process_track = {
            'ok': not ret.exception,
            'time': process_time,
            'follows': len(ret.follows),
            'result': result_text,
            'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
            'exception': ret.exception,
        }

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': fetch_track,
                'process': process_track,
            },
        }
        # FIXME: unicode_obj should be applied in the scheduler before storing
        # to the database; it is done here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should be applied in the scheduler before storing
    # to the database; it is done here for performance.
    if ret.follows:
        follow_batch = [utils.unicode_obj(newtask) for newtask in ret.follows]
        self.newtask_queue.put(follow_batch)

    for target_project, msg, msg_url in ret.messages:
        message_task = {
            'taskid': utils.md5string(msg_url),
            'project': target_project,
            'url': msg_url,
            'process': {
                'callback': '_on_message',
            }
        }
        message_response = {
            'status_code': 200,
            'url': msg_url,
            'save': (task['project'], msg),
        }
        self.inqueue.put((message_task, message_response))

    logger_func = logger.error if (response.error or ret.exception) else logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'],
        task.get('url'), response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))
    return True
开发者ID:0xa-cc,项目名称:pyspider,代码行数:89,代码来源:processor.py
示例19: len
'time': process_time,
'follows': len(ret.follows),
'result': unicode(ret.result)[:self.RESULT_RESULT_LIMIT],
'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
'exception': ret.exception,
},
},
})
self.status_queue.put(status_pack)
for newtask in ret.follows:
self.newtask_queue.put(newtask)
for project, msg, url in ret.messages:
self.inqueue.put(({
'taskid': utils.md5string(url),
'project': project,
'url': url,
'process': {
'callback': '_on_message',
}
}, {
'status_code': 200,
'url': url,
'save': (task['project'], msg),
}))
if response.error or ret.exception:
logger_func = logger.error
else:
logger_func = logger.info
开发者ID:Chuck8080,项目名称:pyspider,代码行数:31,代码来源:processor.py
注:本文中的pyspider.libs.utils.md5string函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论