本文整理汇总了Python中wpull.backport.logging.__函数的典型用法代码示例。如果您正苦于以下问题：Python __函数的具体用法？Python __怎么用？Python __使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了__函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: snapshot
def snapshot(self, remote, html_path=None, render_path=None):
    '''Save the current page as HTML and/or as a rendered PDF.

    Generator-based coroutine: every remote operation is yielded and the
    result is delivered by raising ``tornado.gen.Return``.

    Args:
        remote: Object whose ``eval`` and ``call`` results are yielded on.
        html_path: If truthy, filename that receives the page content
            encoded as UTF-8. A missing parent directory is created.
        render_path: If truthy, filename handed to the remote
            ``page.render`` call.

    Returns:
        The value obtained by evaluating ``page.content`` on the remote.
    '''
    page_content = yield remote.eval('page.content')
    page_url = yield remote.eval('page.url')

    if html_path:
        _logger.debug(__('Saving snapshot to {0}.', html_path))

        parent_dir = os.path.abspath(os.path.dirname(html_path))

        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        with open(html_path, 'wb') as html_file:
            html_file.write(page_content.encode('utf-8'))

        if self._warc_recorder:
            self._add_warc_snapshot(html_path, 'text/html', page_url)

    if render_path:
        _logger.debug(__('Saving snapshot to {0}.', render_path))

        yield remote.call('page.render', render_path)

        if self._warc_recorder:
            self._add_warc_snapshot(
                render_path, 'application/pdf', page_url
            )

    raise tornado.gen.Return(page_content)
开发者ID:mback2k,项目名称:wpull,代码行数:26,代码来源:processor.py
示例2: process
def process(self, session: AppSession):
    '''Locate plugins, announce them, and activate those that opt in.

    A plugin manager is created and stored on ``session.plugin_manager``.
    Every collected plugin gets ``app_session`` set to ``session``; those
    whose ``should_activate()`` is truthy are activated and have their
    hooks connected.

    Args:
        session: The application session; ``session.args.plugin_script``
            is read as an optional extra plugin file.
    '''
    self._debug_log_registered_hooks(session)

    builtin_dir = get_package_filename(
        os.path.join('application', 'plugins')
    )
    script = session.args.plugin_script
    extra_filenames = [script] if script else []

    manager = PluginManager(
        plugin_locator=PluginLocator([builtin_dir], extra_filenames)
    )
    session.plugin_manager = manager
    manager.collectPlugins()

    for info in manager.getAllPlugins():
        # Plugins found under the package's own plugin directory are
        # logged at debug level; all others are logged at info level.
        if info.path.startswith(builtin_dir):
            announce = _logger.debug
        else:
            announce = _logger.info

        announce(__(
            _('Found plugin {name} from {filename}.'),
            filename=info.path,
            name=info.name
        ))

        info.plugin_object.app_session = session

        if not info.plugin_object.should_activate():
            continue

        manager.activatePluginByName(info.name)
        self._connect_plugin_hooks(session, info.plugin_object)
开发者ID:Super-Rad,项目名称:wpull,代码行数:34,代码来源:plugin.py
示例3: _polling_sleep
def _polling_sleep(cls, resource_monitor, log=False):
    '''Wait until the resource monitor stops reporting a problem.

    Generator-based coroutine. The monitor is checked, and while it keeps
    returning a truthy result the coroutine sleeps 60 seconds between
    checks.

    Args:
        resource_monitor: Object whose ``check()`` returns a falsy value
            when resources are fine, otherwise an object with ``path``
            and ``free`` attributes.
        log: When truthy, emit a warning on every 15th check (including
            the first) and a notice once the problem has cleared.
    '''
    polls = 0

    while True:
        status = resource_monitor.check()

        if not status:
            # Only announce recovery if we actually had to wait.
            if log and polls:
                _logger.info(_('Situation cleared.'))

            return

        if log and polls % 15 == 0:
            if status.path:
                message = __(
                    _('Low disk space on {path} ({size} free).'),
                    path=status.path,
                    size=wpull.string.format_size(status.free)
                )
            else:
                message = __(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(status.free)
                )

            _logger.warning(message)
            _logger.warning(_('Waiting for operator to clear situation.'))

        yield from asyncio.sleep(60)
        polls += 1
开发者ID:Super-Rad,项目名称:wpull,代码行数:26,代码来源:resmon.py
示例4: _scrape_document
def _scrape_document(self, request, response, url_item):
    '''Ask the script callback for extra URLs and add each of them.

    The request URL info, the response body info and the body filename
    are converted with ``to_script_native_type`` and passed to
    ``callbacks.get_urls``. Every entry of the result is handed to
    ``_add_hooked_url`` together with ``url_item``.
    '''
    convert = self.to_script_native_type
    native_url_info = convert(request.url_info.to_dict())
    native_document_info = convert(response.body.to_dict())
    native_filename = convert(response.body.content_file.name)

    hooked_urls = self.callbacks.get_urls(
        native_filename, native_url_info, native_document_info
    )

    _logger.debug(__('Hooked scrape returned {0}', hooked_urls))

    if not hooked_urls:
        return

    if to_lua_style(convert, hooked_urls) if False else convert(1) in hooked_urls:
        # Lua doesn't have sequences: the result is a table indexed
        # from 1, and a None lookup marks the end.
        index = 1

        while True:
            entry = hooked_urls[convert(index)]

            _logger.debug(__('Got lua new url info {0}', entry))

            if entry is None:
                break

            self._add_hooked_url(url_item, entry)
            index += 1
    else:
        for entry in hooked_urls:
            self._add_hooked_url(url_item, entry)
开发者ID:mback2k,项目名称:wpull,代码行数:28,代码来源:hook.py
示例5: _check_resource_monitor
def _check_resource_monitor(self):
    '''Pause while the resource monitor reports a problem.

    Generator-based coroutine. Does nothing when no resource monitor is
    configured. Otherwise the monitor is checked, sleeping 60 seconds
    between checks, until ``check()`` returns a falsy value. A warning is
    logged on every 15th check, starting with the first.
    '''
    monitor = self._resource_monitor

    if not monitor:
        return

    attempt = 0

    while True:
        shortage = monitor.check()

        if not shortage:
            # Only announce recovery if we actually had to wait.
            if attempt:
                _logger.info(_('Situation cleared.'))

            return

        if attempt % 15 == 0:
            if shortage.path:
                warning = __(
                    _('Low disk space on {path} ({size} free).'),
                    path=shortage.path,
                    size=wpull.string.format_size(shortage.free)
                )
            else:
                warning = __(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(shortage.free)
                )

            _logger.warning(warning)
            _logger.warning(_('Waiting for operator to clear situation.'))

        yield From(trollius.sleep(60))
        attempt += 1
开发者ID:Willianvdv,项目名称:wpull,代码行数:28,代码来源:engine.py
示例6: _read_input_urls
def _read_input_urls(cls, session: AppSession, default_scheme='http'):
    '''Read the URLs provided by the user.

    Generator. URLs come from ``session.args.urls`` followed by the
    entries of ``session.args.input_file`` when one is given. Each one
    is optionally joined onto ``session.args.base``, parsed, optionally
    rewritten, and then yielded.

    Args:
        session: The application session.
        default_scheme: Passed to ``URLInfo.parse`` as ``default_scheme``.

    Yields:
        The parsed (and possibly rewritten) URL info objects.
    '''
    args = session.args
    candidates = args.urls or ()

    # FIXME: url rewriter isn't created yet
    rewriter = session.factory.get('URLRewriter')

    if args.input_file:
        if args.force_html:
            file_urls = cls._input_file_as_html_links(session)
        else:
            file_urls = cls._input_file_as_lines(session)

        candidates = itertools.chain(candidates, file_urls)

    base = args.base

    for raw_url in candidates:
        _logger.debug(__('Parsing URL {0}', raw_url))

        target = wpull.url.urljoin(base, raw_url) if base else raw_url
        parsed = wpull.url.URLInfo.parse(
            target, default_scheme=default_scheme
        )

        _logger.debug(__('Parsed URL {0}', parsed))

        if rewriter:
            # TODO: this logic should be a hook
            parsed = rewriter.rewrite(parsed)
            _logger.debug(__('Rewritten URL {0}', parsed))

        yield parsed
开发者ID:Super-Rad,项目名称:wpull,代码行数:34,代码来源:database.py
示例7: _make_socket
def _make_socket(self):
    '''Make and wrap the socket with an IOStream.

    Generator-based coroutine. Resolves the original address, creates a
    stream socket of the resolved family, optionally binds it to
    ``bind_address``, and wraps it in ``SSLIOStream`` or ``IOStream``
    depending on ``self._ssl``. The stream's close callback is set to
    ``self._stream_closed_callback``.
    '''
    host, port = self._original_address
    resolved = yield self._resolver.resolve(host, port)
    family, self._resolved_address = resolved

    self._socket = socket.socket(family, socket.SOCK_STREAM)

    _logger.debug(__('Socket to {0}/{1}.', family, self._resolved_address))

    bind_address = self._params.bind_address

    if bind_address:
        _logger.debug(__('Binding socket to {0}', bind_address))
        self._socket.bind(bind_address)

    # Options shared by both the plain and the SSL stream.
    stream_options = {
        'rw_timeout': self._params.read_timeout,
        'max_buffer_size': self._params.buffer_size,
    }

    if self._ssl:
        stream = SSLIOStream(
            self._socket,
            ssl_options=self._params.ssl_options or {},
            server_hostname=host,
            **stream_options
        )
    else:
        stream = IOStream(self._socket, **stream_options)

    self._io_stream = stream
    self._io_stream.set_close_callback(self._stream_closed_callback)
开发者ID:nwpu063291,项目名称:wpull,代码行数:33,代码来源:connection.py
示例8: resolve_all
def resolve_all(self, host, port=0):
    '''Resolve hostname and return a list of results.

    Generator-based coroutine: the result is delivered by raising
    ``Return``.

    Args:
        host (str): The hostname.
        port (int): The port number.

    Returns:
        list: A list of tuples where each tuple contains the family and
        the socket address. See :method:`resolve` for the socket address
        format.

    Raises:
        DNSNotFound: If the lookup produced no results.
    '''
    _logger.debug(__('Lookup address {0} {1}.', host, port))

    # The lookup hook may substitute a different host.
    host = self._lookup_hook(host, port)
    results = None

    if self._cache:
        results = self._get_cache(host, port, self._family)

    # Only go to the network when the cache had nothing for us.
    if results is None:
        results = yield From(self._resolve_from_network(host, port))

    # NOTE(review): the indentation of this snippet was lost; confirm
    # whether this cache write belongs inside the ``results is None``
    # branch above (i.e. only on a cache miss).
    if self._cache:
        self._put_cache(host, port, results)

    if not results:
        raise DNSNotFound(
            "DNS resolution for {0} did not return any results."
            .format(repr(host))
        )

    _logger.debug(__('Resolved addresses: {0}.', results))

    raise Return(results)
开发者ID:Willianvdv,项目名称:wpull,代码行数:35,代码来源:dns.py
示例9: process
def process(self, item_session: ItemSession, request, response, file_writer_session):
    '''Process PhantomJS.

    Coroutine.

    Nothing is done unless the response status is 200 and
    ``HTMLReader`` supports the request/response pair. The driver is run
    up to ``WPULL_PHANTOMJS_TRIES`` times (environment variable,
    default 5): a crash triggers another try, while a timeout or a
    successful run ends processing.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')

    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    max_tries = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for _unused in range(max_tries):
        try:
            yield from self._run_driver(item_session, request, response)
        except asyncio.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            return
        except PhantomJSCrashed as error:
            # Log the crash and fall through to the next try.
            _logger.exception(__('PhantomJS crashed: {}', error))
            continue

        # The driver finished without raising: we are done.
        return

    # Every try crashed (or no tries were allowed).
    _logger.warning(__(
        _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
        url=request.url_info.url
    ))
开发者ID:Super-Rad,项目名称:wpull,代码行数:33,代码来源:phantomjs.py
示例10: control
def control(self, remote):
    '''Scroll the page.

    Generator-based coroutine. The page is scrolled down by one viewport
    height up to ``self._num_scrolls`` times, waiting ``self._wait_time``
    after each scroll. It then waits while resources are still pending,
    scrolls back to the top and logs how many scrolls were performed.

    When smart scrolling is enabled, no scrolling is done if the remote
    ``isPageDynamic`` call is falsy, and scrolling stops as soon as a
    scroll leaves the resource counter values unchanged.

    Args:
        remote: Object providing ``call``, ``eval`` and a
            ``resource_counter`` with ``values()`` and ``pending``.
    '''
    num_scrolls = self._num_scrolls

    if self._smart_scroll:
        is_page_dynamic = yield remote.call('isPageDynamic')

        if not is_page_dynamic:
            num_scrolls = 0

    url = yield remote.eval('page.url')
    total_scroll_count = 0

    for scroll_count in range(num_scrolls):
        _logger.debug(__('Scrolling page. Count={0}.', scroll_count))

        pre_scroll_counter_values = remote.resource_counter.values()

        scroll_position = yield remote.eval('page.scrollPosition')
        scroll_position['top'] += self._viewport_size[1]

        yield self.scroll_to(remote, 0, scroll_position['top'])

        total_scroll_count += 1

        self._log_action('wait', self._wait_time)
        # ``async`` is a reserved keyword since Python 3.7, so writing
        # ``wpull.async`` is a SyntaxError there; look the submodule up
        # by name instead.
        yield getattr(wpull, 'async').sleep(self._wait_time)

        post_scroll_counter_values = remote.resource_counter.values()

        _logger.debug(__(
            'Counter values pre={0} post={1}',
            pre_scroll_counter_values,
            post_scroll_counter_values
        ))

        # Nothing new was requested by scrolling: stop early.
        if (post_scroll_counter_values == pre_scroll_counter_values
                and self._smart_scroll):
            break

    # Wait at most once per resource that was pending when we started.
    for dummy in range(remote.resource_counter.pending):
        if remote.resource_counter.pending:
            self._log_action('wait', self._wait_time)
            yield getattr(wpull, 'async').sleep(self._wait_time)
        else:
            break

    yield self.scroll_to(remote, 0, 0)

    _logger.info(__(
        gettext.ngettext(
            'Scrolled page {num} time.',
            'Scrolled page {num} times.',
            total_scroll_count,
        ), num=total_scroll_count
    ))

    if self._warc_recorder:
        self._add_warc_action_log(url)
开发者ID:mback2k,项目名称:wpull,代码行数:59,代码来源:processor.py
示例11: run
def run(self):
    '''Run PhantomJS on the item's URL and record the outputs.

    Generator-based coroutine. Temporary paths are prepared for the
    snapshot, action log and event log, the driver is started and waited
    on (bounded by ``load_time`` when set), and the action log and
    snapshots are added to the WARC when a recorder is configured.

    Raises:
        PhantomJSCrashed: If the driver process exits with a non-zero
            return code.
    '''
    scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html')
    action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
    event_log_path = self._get_temp_path('phantom-event', suffix='.txt')
    # The scrape snapshot always comes first, followed by any others.
    snapshot_paths = [scrape_snapshot_path]
    snapshot_paths.extend(self._get_snapshot_paths())
    url = self._item_session.url_record.url

    driver_params = PhantomJSDriverParams(
        url=url,
        snapshot_paths=snapshot_paths,
        wait_time=self._params.wait_time,
        num_scrolls=self._params.num_scrolls,
        smart_scroll=self._params.smart_scroll,
        snapshot=self._params.snapshot,
        viewport_size=self._params.viewport_size,
        paper_size=self._params.paper_size,
        event_log_filename=event_log_path,
        action_log_filename=action_log_path,
        custom_headers=self._params.custom_headers,
        page_settings=self._params.page_settings,
    )

    driver = self._phantomjs_driver_factory(params=driver_params)

    _logger.info(__(
        _('PhantomJS fetching ‘{url}’.'),
        url=url
    ))

    # NOTE(review): the indentation of this snippet was lost; confirm
    # which statements belong inside this ``with`` block.
    with contextlib.closing(driver):
        yield from driver.start()

        # FIXME: we don't account that things might be scrolling and
        # downloading so it might not be a good idea to timeout like
        # this
        if self._params.load_time:
            yield from asyncio.wait_for(
                driver.process.wait(), self._params.load_time
            )
        else:
            yield from driver.process.wait()

        if driver.process.returncode != 0:
            raise PhantomJSCrashed(
                'PhantomJS exited with code {}'
                .format(driver.process.returncode)
            )

    if self._warc_recorder:
        self._add_warc_action_log(action_log_path, url)

        for path in snapshot_paths:
            self._add_warc_snapshot(path, url)

    _logger.info(__(
        _('PhantomJS fetched ‘{url}’.'),
        url=url
    ))
开发者ID:Super-Rad,项目名称:wpull,代码行数:58,代码来源:phantomjs.py
示例12: write_record
def write_record(self, record):
    '''Append the record to the WARC file.

    A small journal file (``<warc filename>-wpullinc``) holding the
    pre-write file size is created before writing and removed afterwards.
    If writing fails, the WARC file is cut back to that size so that a
    partially written record does not remain in the file.

    Args:
        record: The record to write. Its ``WARC-Warcinfo-ID`` field is
            set by this method, and iterating it yields the bytes to
            write.

    Raises:
        OSError, IOError: Re-raised after the rollback if writing failed.
    '''
    # FIXME: probably not a good idea to modify arguments passed to us
    # TODO: add extra gzip headers that wget uses
    record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
        WARCRecord.WARC_RECORD_ID]

    _logger.debug(__('Writing WARC record {0}.',
                     record.fields['WARC-Type']))

    if self._params.compress:
        open_func = gzip.GzipFile
    else:
        open_func = open

    # Use getsize to get actual file size. Avoid tell() because it may
    # not be the raw file position.
    if os.path.exists(self._warc_filename):
        before_offset = os.path.getsize(self._warc_filename)
    else:
        before_offset = 0

    journal_filename = self._warc_filename + '-wpullinc'

    with open(journal_filename, 'w') as journal_file:
        journal_file.write('wpull-journal-version:1\n')
        journal_file.write('offset:{}\n'.format(before_offset))

    try:
        with open_func(self._warc_filename, mode='ab') as out_file:
            for data in record:
                out_file.write(data)
    except (OSError, IOError):
        _logger.info(__(
            _('Rolling back file {filename} to length {length}.'),
            filename=self._warc_filename, length=before_offset
        ))

        # Open for update without truncation. Mode 'wb' would empty the
        # file first, so truncate(before_offset) would then only pad it
        # back out and every previously written record would be lost.
        if os.path.exists(self._warc_filename):
            with open(self._warc_filename, mode='r+b') as out_file:
                out_file.truncate(before_offset)

        raise
    finally:
        os.remove(journal_filename)

    after_offset = os.path.getsize(self._warc_filename)

    if self._cdx_filename:
        raw_file_offset = before_offset
        raw_file_record_size = after_offset - before_offset

        self._write_cdx_field(
            record, raw_file_record_size, raw_file_offset
        )
开发者ID:asergi,项目名称:wpull,代码行数:53,代码来源:warc.py
示例13: _load_ca_certs
def _load_ca_certs(cls, session: AppSession, clean: bool=True):
    '''Load the Certificate Authority certificates.

    Certificates are gathered from the bundled CA file, from
    ``args.ca_directory`` and from ``args.ca_certificate`` as selected by
    the session arguments, and are written to a temporary ``.pem`` file.
    The filename is stored on ``session.ca_certs_filename`` and reused on
    later calls.

    Args:
        session: The application session.
        clean: When true, removal of the temporary file is registered
            with ``atexit``.

    Returns:
        The filename of the combined certificate file.
    '''
    args = session.args

    if session.ca_certs_filename:
        return session.ca_certs_filename

    certs = set()

    if args.use_internal_ca_certs:
        pem_filename = os.path.join(
            os.path.dirname(__file__), '..', '..', 'cert', 'ca-bundle.pem'
        )
        certs.update(cls._read_pem_file(pem_filename, from_package=True))

    if args.ca_directory:
        if os.path.isdir(args.ca_directory):
            for filename in os.listdir(args.ca_directory):
                # os.listdir() returns bare names; they must be joined
                # with the directory or they are resolved against the
                # current working directory.
                cert_path = os.path.join(args.ca_directory, filename)

                if os.path.isfile(cert_path):
                    certs.update(cls._read_pem_file(cert_path))
        else:
            _logger.warning(__(
                _('Certificate directory {path} does not exist.'),
                path=args.ca_directory
            ))

    if args.ca_certificate:
        if os.path.isfile(args.ca_certificate):
            certs.update(cls._read_pem_file(args.ca_certificate))
        else:
            _logger.warning(__(
                _('Certificate file {path} does not exist.'),
                path=args.ca_certificate
            ))

    certs_fd, certs_filename = tempfile.mkstemp(
        suffix='.pem', prefix='tmp-wpull-')
    session.ca_certs_filename = certs_filename

    def clean_certs_file():
        os.remove(certs_filename)

    if clean:
        atexit.register(clean_certs_file)

    # Wrap the descriptor returned by mkstemp() so that it is closed
    # together with the file object instead of being leaked.
    with open(certs_fd, 'wb') as certs_file:
        for cert in certs:
            certs_file.write(cert)

    _logger.debug('CA certs loaded.')

    # Return the filename here too, matching the cached early return.
    return certs_filename
开发者ID:Super-Rad,项目名称:wpull,代码行数:50,代码来源:sslcontext.py
示例14: _process_url_item
def _process_url_item(self, url_item):
    '''Process an item.

    Generator-based coroutine that logs the start and the end of the
    session around the processor call.

    Args:
        url_item (:class:`.item.URLItem`): The item to process.

    This function calls :meth:`.processor.BaseProcessor.process`.
    '''
    def describe(template):
        # Attributes are read at call time, so the closing message
        # reflects the item's state after processing.
        return __(template, url_item.url_record, url_item.url_info)

    _logger.debug(describe('Begin session for {0} {1}.'))

    yield self._processor.process(url_item)

    _logger.debug(describe('End session for {0} {1}.'))
开发者ID:mback2k,项目名称:wpull,代码行数:15,代码来源:engine.py
示例15: _read_content
def _read_content(self, response, original_url_info):
    '''Read response and parse the contents into the pool.

    The first 4096 bytes of the response body are loaded into the
    robots.txt pool. If they cannot be parsed, a warning is logged and
    the URL is accepted as if its robots.txt were blank.

    Args:
        response: The response whose ``body`` is read.
        original_url_info: URL info identifying the robots.txt file.
    '''
    data = response.body.read(4096)

    try:
        self._robots_txt_pool.load_robots_txt(original_url_info, data)
    except ValueError:
        # The template uses the named field {url}, so the value has to
        # be supplied as a keyword argument; a positional argument
        # leaves the field unfilled.
        _logger.warning(__(
            _('Failed to parse {url} for robots exclusion rules. '
              'Ignoring.'), url=original_url_info.url))
        self._accept_as_blank(original_url_info)
    else:
        _logger.debug(__('Got a good robots.txt for {0}.',
                         original_url_info.url))
开发者ID:Willianvdv,项目名称:wpull,代码行数:15,代码来源:robots.py
示例16: _connect
def _connect(self):
    '''Connect the socket if not already connected.

    Generator-based coroutine.

    Raises:
        SSLVerficationError: On a certificate error.
        ConnectionRefused: If the error number is ``ECONNREFUSED``.
        NetworkError: On any other SSL or socket error.
    '''
    if self.connected:
        # Reset the callback so the context does not leak to another
        self._io_stream.set_close_callback(self._stream_closed_callback)
        return

    yield self._make_socket()

    _logger.debug(__('Connecting to {0}.', self._resolved_address))

    try:
        yield self._io_stream.connect(
            self._resolved_address, timeout=self._params.connect_timeout
        )
    except (tornado.netutil.SSLCertificateError,
            SSLVerficationError) as error:
        message = 'Certificate error: {error}'.format(error=error)
        raise SSLVerficationError(message) from error
    except (ssl.SSLError, socket.error) as error:
        # A refused connection gets its own exception type.
        if error.errno == errno.ECONNREFUSED:
            error_class = ConnectionRefused
            template = 'Connection refused: {error}'
        else:
            error_class = NetworkError
            template = 'Connection error: {error}'

        raise error_class(template.format(error=error)) from error

    # Every handler above raises, so this is only reached on success.
    _logger.debug('Connected.')
开发者ID:nwpu063291,项目名称:wpull,代码行数:27,代码来源:connection.py
示例17: scrape
def scrape(self, request, response, link_type=None):
    '''Collect the links found in the response body.

    Args:
        request: The request; its URL is used as the base URL.
        response: The response whose body is scanned.
        link_type: If given and not ``LinkType.css``, nothing is scraped.

    Returns:
        ``None`` if the document is unsupported or filtered out by
        ``link_type``; otherwise a ``ScrapeResult`` holding the link
        contexts and the encoding that was used.
    '''
    if not self.is_supported(request=request, response=response):
        return

    wants_other_type = link_type and link_type != LinkType.css

    if wants_other_type:
        return

    found = set()
    base_url = request.url_info.url
    encoding = self._encoding_override
    encoding = encoding or detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            links = self.iter_processed_links(
                response.body, encoding, base_url, context=True
            )

            for link, context in links:
                # Links from an import are stylesheets; everything
                # else is treated as media.
                link_type = (
                    LinkType.css if context == 'import' else LinkType.media
                )
                found.add(
                    LinkContext(link, inline=True, link_type=link_type)
                )
    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(found, encoding)
开发者ID:Willianvdv,项目名称:wpull,代码行数:29,代码来源:css.py
示例18: _run_worker
def _run_worker(self):
    '''Run a single consumer.

    Coroutine.

    Items are taken from the item queue and processed until the poison
    pill is received. Setting the ``OBJGRAPH_DEBUG`` or
    ``FILE_LEAK_DEBUG`` environment variables enables extra diagnostics
    after each item.
    '''
    _logger.debug('Worker start.')

    while True:
        _priority, item = yield From(self._item_queue.get())

        if item == self.POISON_PILL:
            _logger.debug('Worker quitting.')
            return

        _logger.debug(__('Processing item {0}.', item))
        self._item_get_semaphore.release()
        self._token_queue.get_nowait()
        yield From(self._process_item(item))
        self._token_queue.task_done()

        if os.environ.get('OBJGRAPH_DEBUG'):
            import gc
            import objgraph

            gc.collect()
            objgraph.show_most_common_types(25)

        if os.environ.get('FILE_LEAK_DEBUG'):
            import subprocess

            lsof_command = ['lsof', '-p', str(os.getpid()), '-n']
            listing = subprocess.check_output(lsof_command)

            for entry in listing.decode('ascii', 'replace').split('\n'):
                if 'REG' not in entry:
                    continue

                # Report entries mentioning the working directory or
                # /tmp/.
                if os.getcwd() in entry or '/tmp/' in entry:
                    print('FILELEAK', entry)
开发者ID:Willianvdv,项目名称:wpull,代码行数:34,代码来源:engine.py
示例19: _get_next_url_record
def _get_next_url_record(self):
    '''Return the next available URL from the URL table.

    This function will return items marked as "todo" and then items
    marked as "error". As a consequence, items experiencing errors will
    be done last.

    Returns:
        :class:`.item.URLRecord`, or ``None`` if neither status has an
        item available.
    '''
    def check_out_or_none(status):
        # Treat "nothing with this status" as an absent record.
        try:
            return self._url_table.check_out(status)
        except NotFound:
            return None

    _logger.debug('Get next URL todo.')
    record = check_out_or_none(Status.todo)

    if not record:
        _logger.debug('Get next URL error.')
        record = check_out_or_none(Status.error)

    _logger.debug(__('Return record {0}.', record))

    return record
开发者ID:Willianvdv,项目名称:wpull,代码行数:27,代码来源:engine.py
示例20: _read_response_by_length
def _read_response_by_length(self, response):
    '''Read the connection specified by a length.

    Generator-based coroutine. The ``Content-Length`` field gives the
    number of bytes to read. If it is not a valid non-negative integer,
    a warning is logged and the body is read until the connection
    closes instead.

    Args:
        response: The response whose body content file receives the
            decompressed data.
    '''
    _logger.debug('Reading body by length.')

    try:
        declared_length = int(response.fields['Content-Length'])

        if declared_length < 0:
            raise ValueError('Content length cannot be negative.')
    except ValueError as error:
        _logger.warning(__(
            _('Invalid content length: {error}'), error=error
        ))

        yield self._read_response_until_close(response)
        return

    def on_chunk(chunk):
        # Report the raw data, then store the decompressed form.
        self._events.response_data.fire(chunk)
        response.body.content_file.write(self._decompress_data(chunk))

    yield self._io_stream.read_bytes(
        declared_length, streaming_callback=on_chunk,
    )

    # Write out whatever the decompressor is still holding.
    response.body.content_file.write(self._flush_decompressor())
开发者ID:nwpu063291,项目名称:wpull,代码行数:27,代码来源:connection.py
注：本文中的wpull.backport.logging.__函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论