This article collects typical usage examples of the Python function slybot.extractors.apply_extractors. If you have been wondering what apply_extractors does, how to call it, or what real-world usage looks like, the curated examples below should help.
The 20 code examples of apply_extractors shown below are sorted by popularity by default. You can upvote any example you like or find useful; your feedback helps the system recommend better Python code samples.
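Before reading the examples, here is a minimal sketch of the pattern they all share: build an item descriptor from a schema, define extractors keyed by id, and call apply_extractors to attach them to the descriptor's fields. The import paths are assumed from the slybot test suite, and the schema, field name, and regular expression are purely illustrative. Note that the examples use two calling conventions: newer slybot/portia code passes a dict mapping field names to extractor ids (as in Examples 2, 9 and 11), while older code passes a flat list of ids whose definitions carry their own "field_name" (as in Examples 1, 5, 13 and 19).

# Minimal sketch (assumed import paths; schema and regex are illustrative):
from slybot.extractors import apply_extractors
from slybot.item import create_slybot_item_descriptor

schema = {'fields': {'gender': {'required': False, 'type': 'text', 'vary': False}}}
descriptor = create_slybot_item_descriptor(schema)
extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
# Newer convention: map each field name to the extractor ids to apply.
apply_extractors(descriptor, {"gender": [1]}, extractors)
# Older convention: pass a list of ids; each definition then names its own
# target field, e.g. {1: {"_id": 1, "field_name": "gender", ...}}:
# apply_extractors(descriptor, [1], extractors)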
Example 1: test_type_extractor
def test_type_extractor(self):
    schema = {
        "id": "test",
        "properties": [('gender', {
            'description': '',
            'optional': True,
            'type': 'number',
            'vary': False,
        })],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "_id": 1,
            "field_name": "gender",
            "type_extractor": "text"
        },
        2: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(descriptor, [1, 2], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: alepharchives, Project: slybot, Lines: 27, Source: test_extractors.py
Example 2: test_extractor_w_empty_string_extraction
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template2, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'], [u'Name Olivia'])
Author: FFFFFurry, Project: portia, Lines: 26, Source: test_extractors.py
Example 3: setup_bot
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in spec.get('templates')}
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls
    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)
    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self.build_url_filter(spec)
Author: FrankieChan885, Project: portia, Lines: 60, Source: annotations.py
Example 4: test_extractor_w_empty_string_extraction
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
Author: 1060460048, Project: portia, Lines: 25, Source: test_extractors.py
Example 5: test_extractor_w_empty_string_extraction
def test_extractor_w_empty_string_extraction(self):
    schema = {
        "id": "test",
        "properties": [
            ('gender', {
                'description': '',
                'optional': True,
                'type': 'text',
                'vary': False,
            }),
            ('name', {
                'description': '',
                'optional': False,
                'type': 'text',
                'vary': False,
            }),
        ],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, [1], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
Author: alepharchives, Project: slybot, Lines: 30, Source: test_extractors.py
Example 6: __init__
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page, dont_filter=True)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            self.generic_form = GenericForm(**kw)
            self.form_requests.append(self.get_generic_form_start_request(rdata))
Author: Big-Data, Project: slybot, Lines: 59, Source: spider.py
Example 7: __init__
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
            val = val.splitlines()
        spec[key] = val
    self.i = time.time()
    self.getProxyList()
    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get('allowed_domains',
                                    self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
Author: lodow, Project: portia-proxy, Lines: 58, Source: spider.py
Example 8: __init__
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)
    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
    self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self._fpages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type', 'item') == 'form'
    ]
    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()
    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)
    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(self._default_schema)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls
        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
Author: madberry, Project: slybot, Lines: 58, Source: spider.py
Example 9: test_default_type_extractor
def test_default_type_extractor(self):
    schema = {
        'fields': {}
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {"regular_expression": "Gender\\s+(Male|Female)"}
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
Author: FFFFFurry, Project: portia, Lines: 13, Source: test_extractors.py
Example 10: test_default_type_extractor
def test_default_type_extractor(self):
    schema = {
        'fields': {}
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: 1060460048, Project: portia, Lines: 14, Source: test_extractors.py
Example 11: test_text_type_w_regex
def test_text_type_w_regex(self):
    schema = {
        "fields": {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
Author: FFFFFurry, Project: portia, Lines: 17, Source: test_extractors.py
Example 12: setup_bot
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    self.itemcls_info = {}
    if settings.get('AUTO_PAGINATION'):
        self.html_link_extractor = PaginationExtractor()
    else:
        self.html_link_extractor = HtmlLinkExtractor()
    for itemclass_name, triplets in groupby(_item_template_pages,
                                            itemgetter(0)):
        page_extractors_pairs = map(itemgetter(1, 2), triplets)
        schema = items[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self.build_url_filter(spec)
Author: BenJamesbabala, Project: portia, Lines: 46, Source: annotations.py
Example 13: test_default_type_extractor
def test_default_type_extractor(self):
    schema = {
        "id": "test",
        "properties": [],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "_id": 1,
            "field_name": "gender",
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(descriptor, [1], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: alepharchives, Project: slybot, Lines: 17, Source: test_extractors.py
Example 14: setup_bot
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted(
        (
            [t["scrapes"], dict_to_page(t, "annotated_body"), t.get("extractors", [])]
            for t in spec["templates"]
            if t.get("page_type", "item") == "item"
        ),
        key=lambda pair: pair[0],
    )
    self.itemcls_info = {}
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)):
        page_extractors_pairs = map(itemgetter(1, 2), triplets)
        schema = items[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            "class": item_cls,
            "descriptor": item_descriptor,
            "extractor": extractor,
        }
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"]
    _links_item_descriptor = create_slybot_item_descriptor({"fields": {}})
    self._links_ibl_extractor = (
        InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages])
        if _links_pages
        else None
    )
    self.build_url_filter(spec)
Author: hackoose, Project: portia, Lines: 45, Source: annotations.py
Example 15: test_raw_type_w_regex
def test_raw_type_w_regex(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {
        "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
    }}
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
Author: 1060460048, Project: portia, Lines: 18, Source: test_extractors.py
Example 16: test_text_type_w_regex
def test_text_type_w_regex(self):
    schema = {
        "fields": {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {
        "regular_expression": "Gender\\s+(Male|Female)"
    }}
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: 1060460048, Project: portia, Lines: 18, Source: test_extractors.py
Example 17: setup_bot
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ))
    self.item_classes = {}
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls
    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    for default, template, template_extractors in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        page_descriptor_pairs.append((template, descriptors))
    self.extractors = SlybotIBLExtractor(page_descriptor_pairs)
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self.build_url_filter(spec)
Author: TimoC1982, Project: portia, Lines: 44, Source: annotations.py
Example 18: test_raw_type_w_regex
def test_raw_type_w_regex(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {"regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"}
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'<td >Male</td>'])
Author: FFFFFurry, Project: portia, Lines: 19, Source: test_extractors.py
Example 19: test_raw_type_w_regex
def test_raw_type_w_regex(self):
    schema = {
        "id": "test",
        "properties": [('gender', {
            'description': '',
            'optional': True,
            'type': 'raw',
            'vary': False,
        })],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {
        "_id": 1,
        "field_name": "gender",
        "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
    }}
    apply_extractors(descriptor, [1], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
Author: alepharchives, Project: slybot, Lines: 20, Source: test_extractors.py
Example 20: setup_bot
def setup_bot(self, settings, spec, items, extractors, logger):
    """
    Perform any initialization needed for crawling using this plugin
    """
    self.logger = logger
    templates = map(self._get_annotated_template, spec['templates'])
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in templates if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in templates}
    if (settings.get('AUTO_PAGINATION') or
            spec.get('links_to_follow') == 'auto'):
        self.html_link_extractor = PaginationExtractor()
    else:
        self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls
    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)
    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in templates if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self.build_url_filter(spec)
    # Clustering
    self.template_names = [t.get('page_id') for t in spec['templates']]
    if settings.get('PAGE_CLUSTERING'):
        try:
            import page_clustering
            self.clustering = page_clustering.kmeans_from_samples(spec['templates'])
            self.logger.info("Clustering activated")
        except ImportError:
            self.clustering = None
            self.logger.warning(
                "Clustering could not be used because it is not installed")
    else:
        self.clustering = None
Author: NamiStudio, Project: portia, Lines: 79, Source: annotations.py
Note: The slybot.extractors.apply_extractors examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are taken from open-source projects contributed by many developers, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code. Do not reproduce this page without permission.