
Python linkextractor.create_linkextractor_from_specs Function Code Examples


This article collects typical usage examples of the Python function slybot.linkextractor.create_linkextractor_from_specs. If you have been wondering exactly how create_linkextractor_from_specs is used, or are looking for worked examples of it, the curated code samples here may help.



The following presents 16 code examples of create_linkextractor_from_specs, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
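
All of the examples below follow the same pattern: build a specs dict whose "type" field selects the extractor (html, regex, xpath, column, rss, atom, sitemap, pagination, ...) and whose "value" field carries its main argument, pass it to create_linkextractor_from_specs, and iterate links_to_follow over a response. The minimal sketch below is not taken from the slybot test suite; the HtmlResponse construction (standing in for the UTF8HtmlResponse test helper used in Example 1) and the sample HTML body are illustrative assumptions.

 from scrapy.http import HtmlResponse
 from slybot.linkextractor import create_linkextractor_from_specs

 # "type" selects the extractor class; "value" is its main argument
 # (a regex pattern, an XPath expression, a CSV column index, ...).
 specs = {"type": "html", "value": None}
 lextractor = create_linkextractor_from_specs(specs)

 # Illustrative response; the tests below build these from fixtures instead.
 body = b'<html><body><a href="http://www.example.com/path">Click here</a></body></html>'
 response = HtmlResponse(url="http://www.example.com/", body=body, encoding="utf-8")

 # links_to_follow yields link objects exposing .url and .text
 for link in lextractor.links_to_follow(response):
     print(link.url, link.text)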

Example 1: test_simple

 def test_simple(self):
     specs = {"type": "html", "value": None}
     lextractor = create_linkextractor_from_specs(specs)
     response = UTF8HtmlResponse(url='http://www.example.com/', body=html)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[0].text, 'Click here')
Developer: daqv, Project: portia-dashboard, Lines: 8, Source: test_linkextractors.py


Example 2: test_custom_withargs

 def test_custom_withargs(self):
     specs = {"type": "regex", "value": 'url: ((?:http|https)://www.example.com/[\w/]+)', 'allowed_schemes': ['http']}
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = UTF8TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
Developer: daqv, Project: portia-dashboard, Lines: 8, Source: test_linkextractors.py


Example 3: test_extra_params

 def test_extra_params(self):
     specs = {"type": "column", "value": 1, "delimiter": "|"}
     lextractor = create_linkextractor_from_specs(specs)
     response = TextResponse(url="http://www.example.com/", body=csvfeed2)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[1].url, "http://www.example.com/path2")
Developer: plafl, Project: portia, Lines: 8, Source: test_linkextractors.py


Example 4: test_header

 def test_header(self):
     specs = {"type": "column", "value": 1}
     lextractor = create_linkextractor_from_specs(specs)
     response = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[1].url, 'http://www.example.com/path2')
Developer: daqv, Project: portia-dashboard, Lines: 8, Source: test_linkextractors.py


Example 5: test_default

 def test_default(self):
     specs = {"type": "regex", "value": ''}
     lextractor = create_linkextractor_from_specs(specs)
     text = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
     response = UTF8TextResponse(url='http://www.example.com/', body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[1].url, 'https://aws.amazon.com/product?id=23')
Developer: daqv, Project: portia-dashboard, Lines: 9, Source: test_linkextractors.py


Example 6: test_simple

 def test_simple(self):
     specs = {"type": "pagination", "value": None}
     lextractor = create_linkextractor_from_specs(specs)
     html_page = htmlpage_from_response(HtmlResponse(url="http://www.example.com/", body=html))
     html_page.headers["n_items"] = 1
     links = list(lextractor.links_to_follow(html_page))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[0].text, "Click here")
Developer: plafl, Project: portia, Lines: 9, Source: test_linkextractors.py


Example 7: test_custom

 def test_custom(self):
     specs = {"type": "regex", "value": "url: ((?:http|https)://www.example.com/[\w/]+)"}
     lextractor = create_linkextractor_from_specs(specs)
     text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
     response = TextResponse(url="http://www.example.com/", body=text)
     links = list(lextractor.links_to_follow(response))
     self.assertEqual(len(links), 2)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[1].url, "https://www.example.com/path2")
Developer: plafl, Project: portia, Lines: 9, Source: test_linkextractors.py


Example 8: test_sitemap

 def test_sitemap(self):
     specs = {"type": "sitemap", "value": ""}
     lextractor = create_linkextractor_from_specs(specs)
     links = list(lextractor.links_to_follow(self.sitemap))
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, 'http://www.accommodationforstudents.com/')

     links = list(lextractor.links_to_follow(self.sitemapindex))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/sitemap1.xml.gz')
Developer: daqv, Project: portia-dashboard, Lines: 10, Source: test_linkextractors.py


Example 9: _create_start_request_from_specs

 def _create_start_request_from_specs(self, info):
     url = info["url"]
     lspecs = info.get("link_extractor")
     if lspecs:
         linkextractor = create_linkextractor_from_specs(lspecs)
         def _callback(spider, response):
             for link in linkextractor.links_to_follow(response):
                 yield Request(url=link.url, callback=spider.parse)
         return Request(url=url, callback=_callback)
     return Request(url=url, callback=self.parse)
Developer: Kola0o0, Project: slybot, Lines: 10, Source: spider.py


Example 10: handle_xml

 def handle_xml(self, response, seen):
     _type = content_type(response).subtype.split('+')[0]
     try:
         link_extractor = create_linkextractor_from_specs({
             'type': _type, 'value': ''
         })
     except ValueError:
         link_extractor = XmlLinkExtractor()
     for link in link_extractor.links_to_follow(response):
         request = self._filter_link(link, seen)
         if request:
             yield request
Developer: fakegit, Project: portia, Lines: 12, Source: annotations.py


Example 11: handle_xml

 def handle_xml(self, response, seen):
     _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
     _type = _type.groupdict()['type'] if _type else 'xml'
     try:
         link_extractor = create_linkextractor_from_specs({
             'type': _type, 'value': ''
         })
     except ValueError:
         link_extractor = SitemapLinkExtractor()
     for link in link_extractor.links_to_follow(response):
         request = self._filter_link(link, seen)
         if request:
             yield request
Developer: BenJamesbabala, Project: portia, Lines: 13, Source: annotations.py


Example 12: test_start_urls

 def test_start_urls(self):
     specs = {"type": "pagination",
              "value": None,
              "start_urls": ['http://www.spam.com/?p=1',
                             'http://www.eggs.com/?page=0']
     }
     lextractor = create_linkextractor_from_specs(specs)
     html = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     html_page = htmlpage_from_response(
         HtmlResponse(url='http://www.example.com/', body=html))
     links = list(lextractor.links_to_follow(html_page))
     links = sorted(links, key=lambda link: link.url)
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, "http://www.spam.com/?p=100")
     self.assertEqual(links[1].url, "http://www.spam.com/?p=200")
     self.assertEqual(links[2].url, "http://www.spam.com/?p=300")
     self.assertEqual(links[0].text, 'Click here 1')
     self.assertEqual(links[1].text, 'Click here 2')
     self.assertEqual(links[2].text, 'Click here 3')
Developer: 01-, Project: portia, Lines: 23, Source: test_linkextractors.py


Example 13: test_xml_remove_namespaces

 def test_xml_remove_namespaces(self):
     specs = {"type": "xpath", "value": "//link/@href", "remove_namespaces": True}
     lextractor = create_linkextractor_from_specs(specs)
     links = list(lextractor.links_to_follow(self.atom))
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, 'http://example.org/feed/')
Developer: daqv, Project: portia-dashboard, Lines: 6, Source: test_linkextractors.py


Example 14: test_atom

 def test_atom(self):
     specs = {"type": "atom", "value": ""}
     lextractor = create_linkextractor_from_specs(specs)
     links = list(lextractor.links_to_follow(self.atom))
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, 'http://example.org/feed/')
Developer: daqv, Project: portia-dashboard, Lines: 6, Source: test_linkextractors.py


Example 15: test_xml

 def test_xml(self):
     specs = {"type": "xpath", "value": "//item/link/text()"}
     lextractor = create_linkextractor_from_specs(specs)
     links = list(lextractor.links_to_follow(self.response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.wikipedia.org/')
Developer: daqv, Project: portia-dashboard, Lines: 6, Source: test_linkextractors.py


Example 16: test_rss

 def test_rss(self):
     specs = {"type": "rss", "value": ""}
     lextractor = create_linkextractor_from_specs(specs)
     links = list(lextractor.links_to_follow(self.response))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, "http://www.wikipedia.org/")
Developer: plafl, Project: portia, Lines: 6, Source: test_linkextractors.py



Note: The slybot.linkextractor.create_linkextractor_from_specs examples in this article were compiled by 纯净天空 from source code and documentation hosted on platforms such as GitHub and MSDocs. The snippets were selected from open-source projects; copyright remains with the original authors, and any use or redistribution should follow each project's license. Please do not reproduce this article without permission.

