This article collects typical usage examples of the Python class slybot.generic_form.GenericForm. If you are unsure what GenericForm is for or how to use it, the curated class examples below may help.
The following presents 12 code examples of the GenericForm class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
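Before the examples, here is a minimal sketch of the typical call pattern (the HTML file name and the form descriptor values are illustrative placeholders; the method signature follows the test code shown in Examples 1, 5 and 6):

import json
from slybot.generic_form import GenericForm

url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
body = open('ebay_advanced_search.html').read()  # HTML page containing the form
form_descriptor = json.loads("""{
    "type": "form",
    "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
    "xpath": "//form[@name='adv_search_from']",
    "fields": [
        {"xpath": ".//*[@name='_nkw']", "type": "constants", "value": ["Cars"]}
    ]
}""")

generic_form = GenericForm()
# fill_generic_form yields one (formdata, action_url, method) tuple per
# combination of generated field values
for args, action_url, method in generic_form.fill_generic_form(url, body,
                                                               form_descriptor):
    print(method, action_url, args)
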
Example 1: test_advanced_search_form_regex
def test_advanced_search_form_regex(self):
    url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
    body = open(join(_PATH, "data", "ebay_advanced_search.html")).read()
    form_descriptor = json.loads("""{
        "type": "form",
        "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        "xpath": "//form[@name='adv_search_from']",
        "fields": [
            {
                "xpath": ".//*[@name='_nkw']",
                "type": "constants",
                "value": ["Cars"]
            },
            {
                "xpath": ".//*[@name='_in_kw']",
                "type": "iterate",
                "value": "[1-2]"
            }
        ]
    }""")
    generic_form = GenericForm()
    start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
    expected_requests = [([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '2'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')]
    self.assertEqual(start_requests, expected_requests)
Author: 01-, Project: portia, Lines of code: 25, Source file: test_generic_form.py

Example 2: __init__
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'), key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains',
        self._get_allowed_domains(self._templates)
    )
    if not self.allowed_domains:
        self.allowed_domains = None
Author: nju520, Project: portia, Lines of code: 34, Source file: spider.py

Example 3: __init__
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])] \
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page, dont_filter=True)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            self.generic_form = GenericForm(**kw)
            self.form_requests.append(self.get_generic_form_start_request(rdata))
Author: Big-Data, Project: slybot, Lines of code: 59, Source file: spider.py

Example 4: __init__
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
            val = val.splitlines()
        spec[key] = val
    self.i = time.time()
    self.getProxyList()
    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])] \
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get('allowed_domains',
                                    self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
Author: lodow, Project: portia-proxy, Lines of code: 58, Source file: spider.py

Example 5: test_simple_search_form_with_named_parameter
def test_simple_search_form_with_named_parameter(self):
    url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
    body = open(join(_PATH, "data", "ebay_advanced_search.html")).read()
    form_descriptor = json.loads("""{
        "type": "form",
"form_url": "http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
"xpath": "//form[@name='adv_search_from']",
"fields": [
{
"name": "my_param",
"type": "constants",
"value": ["Cars"]
}
]
}""")
generic_form = GenericForm()
start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
expected_requests = [([('_in_kw', '1'), ('_udlo', ''), ('_ex_kw', ''), ('_nkw', ''), ('_ipg', '50'), ('_adv', '1'), ('_salic', '1'), ('_dmd', '1'), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_sop', '12'), (u'my_param', u'Cars'), ('_sasl', '')], 'http://www.ebay.com/sch/i.html', 'GET')]
self.assertEqual(start_requests, expected_requests)
Author: 9thSymfony, Project: slybot, Lines of code: 20, Source file: test_generic_form.py

Example 6: test_simple_search_form_with_file_type
def test_simple_search_form_with_file_type(self):
    url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
    body = open(join(_PATH, "data", "ebay_advanced_search.html")).read()
    form_descriptor = json.loads("""{
        "type": "form",
        "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        "xpath": "//form[@name='adv_search_from']",
        "fields": [
            {
                "name": "my_param",
                "type": "inurl",
                "value": "file://%s/test_params.txt",
                "file_values": ["Cars", "Boats", "Houses", "Electronics"]
            }
        ]
    }""" % join(_PATH, "data"))
    generic_form = GenericForm()
    start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
    expected_requests = [([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Cars'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Boats'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Houses'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Electronics'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')]
    self.assertEqual(start_requests, expected_requests)
Author: 01-, Project: portia, Lines of code: 21, Source file: test_generic_form.py

Example 7: __init__
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'), key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains',
        self._get_allowed_domains(self._templates)
    )
    self.page_actions = spec.get('page_actions', [])
    if not self.allowed_domains:
        self.allowed_domains = None
Author: codegreencreative, Project: portia, Lines of code: 47, Source file: spider.py

Example 8: __init__
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    self.start_url_generators = {
        'start_urls': StartUrls(),
        'generated_urls': UrlGenerator(settings, kw)
    }
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    self._add_spider_args_to_spec(spec, kw)
    self.plugins = self._configure_plugins(
        settings, spec, item_schemas, all_extractors)
    self._configure_js(spec, settings)
    self.login_requests, self.form_requests = [], []
    self._start_requests = []
    self._create_init_requests(spec)
    self._process_start_urls(spec)
    self._add_allowed_domains(spec)
    self.page_actions = spec.get('page_actions', [])
Author: 01-, Project: portia, Lines of code: 19, Source file: spider.py

Example 9: IblSpider
class IblSpider(Spider):

    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))
        self._templates = [templ for _, templ in self._item_template_pages]
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            request = Request(url, callback=self.parse, dont_filter=True)
            self._add_splash_meta(request)
            self._start_requests.append(request)

    def _create_init_requests(self, spec):
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata)
                )

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)
# ......... (rest of the code omitted) .........
Author: PrasannaVenkadesh, Project: portia, Lines of code: 101, Source file: spider.py

Example 10: IblSpider
class IblSpider(Spider):

    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))
        self._templates = [templ for _, templ in self._item_template_pages]
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            self._start_requests.append(Request(url, callback=self.parse,
                                                dont_filter=True))

    def _create_init_requests(self, spec):
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata)
                )

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
# ......... (rest of the code omitted) .........
Author: nju520, Project: portia, Lines of code: 101, Source file: spider.py

Example 11: IblSpider
class IblSpider(BaseSpider):

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None
        self._ipages = [page for _, page, _ in self._item_template_pages]
        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)
            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))
            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()
        for url in self.start_urls:
            self._start_requests.append(Request(url, callback=self.parse, dont_filter=True))

    def _create_init_requests(self, spec):
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(self.get_generic_form_start_request(rdata))
            elif rdata["type"] == "start":
                self._start_requests.append(self._create_start_request_from_specs(rdata))

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body, username, password)
        return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor,
                               callback=self.parse_field_url_page, dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor,
                           callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
# ......... (rest of the code omitted) .........
Author: Kola0o0, Project: slybot, Lines of code: 101, Source file: spider.py

Example 12: IblSpider
class IblSpider(SitemapSpider):

    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'generated': FragmentGenerator(),
            # 'feed_urls': FeedUrls(self, settings, kw)
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)
        self.plugins = self._configure_plugins(
            settings, spec, item_schemas, all_extractors)
        self._configure_js(spec, settings)
        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])

    def _add_spider_args_to_spec(self, spec, args):
        for key, val in args.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

    def _create_start_urls(self, spec):
        url_type = spec.get('start_urls_type', 'start_urls')
        return StartUrlCollection(
            arg_to_iter(spec[url_type]),
            self.start_url_generators,
            url_type
        )

    def _create_start_requests(self, spec):
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "start":
                yield self._create_start_request_from_specs(rdata)
        for start_url in self._start_urls:
            if not isinstance(start_url, Request):
                start_url = Request(start_url, callback=self.parse,
                                    dont_filter=True)
            yield self._add_splash_meta(start_url)

    def _create_init_requests(self, spec):
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )

    def _add_allowed_domains(self, spec):
        self.allowed_domains = spec.get('allowed_domains', [])
        if self.allowed_domains is not None and not self.allowed_domains:
            self.allowed_domains = self._get_allowed_domains(spec)

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
# ......... (rest of the code omitted) .........
Author: NamiStudio, Project: portia, Lines of code: 101, Source file: spider.py

Note: the slybot.generic_form.GenericForm class examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please consult each project's License before redistributing or reusing the code; do not reproduce without permission.