Python url.urljoin_rfc Function Code Examples


This article collects typical usage examples of the w3lib.url.urljoin_rfc function in Python. If you have been wondering what exactly urljoin_rfc does, how to call it, or what real uses of it look like, the hand-picked code examples below should help.



The following presents 20 code examples of the urljoin_rfc function, ordered by popularity by default.
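
Before the examples themselves, here is a minimal sketch of what urljoin_rfc does: it joins a relative URL onto a base URL following the RFC resolution rules, much like the standard library's urljoin. This sketch assumes a w3lib version that still ships the function (it was deprecated in later releases), and the URLs are made up for illustration:

    from w3lib.url import urljoin_rfc

    base = 'http://example.com/some/path/'
    print(urljoin_rfc(base, 'page.html'))   # http://example.com/some/path/page.html
    print(urljoin_rfc(base, '../other'))    # http://example.com/some/other
    print(urljoin_rfc(base, '/absolute'))   # http://example.com/absolute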

Example 1: parse

    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        for href in response.xpath('//table/tr/td/strong/a/@href').extract():
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse PDF links
        for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract():
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse pagination links
        for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract():
            if ("page=" not in href  and "browse-date?top=" not in href ) or "itemsPerPage=" in href:
                continue

            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 30, Source file: handle.py
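
Example 1, like most of the Scrapy spiders below, follows a two-step pattern: obtain the page's base URL with get_base_url(response), then absolutize every extracted href with urljoin_rfc. Reduced to its core, the pattern looks roughly like the sketch below; the helper name absolute_links and the default XPath are illustrative, not taken from any of the projects quoted here:

    from w3lib.url import urljoin_rfc
    from scrapy.utils.response import get_base_url

    def absolute_links(response, xpath='//a/@href'):
        # Resolve each extracted href against the page's base URL, which
        # honors any <base href="..."> declaration in the HTML.
        base_url = get_base_url(response)
        return [urljoin_rfc(base_url, href) for href in response.xpath(xpath).extract()]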


Example 2: parse_index

    def parse_index(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url,callback=self.parse_index)
            return
        base_url  = get_base_url(response)
        # parse the journal home page
        count = 0
        for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract():
            if href.startswith("Rss.ashx?"):
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            #self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            yield scrapy.Request(url=abs_url,callback=self.parse_content)
            count += 1
        self.log("Fuck %s %d"%(response.url,count),level=scrapy.log.INFO)

        # parse index-page pagination links
        for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract():
            if "PageNo" not in href:
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse_index)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 27, Source file: wanfangdata.py


Example 3: extract_links

    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
Developer ID: bihicheng, Project: scrapy, Lines of code: 30, Source file: image.py


Example 4: process_response

    def process_response(self, request, response, spider):
        if "dont_redirect" in request.meta:
            return response
        if request.method.upper() == "HEAD":
            if response.status in [301, 302, 303, 307] and "Location" in response.headers:
                redirected_url = urljoin_rfc(request.url, response.headers["location"])
                redirected = request.replace(url=redirected_url)
                return self._redirect(redirected, request, spider, response.status)
            else:
                return response

        if response.status in [302, 303] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = self._redirect_request_using_get(request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if response.status in [301, 307] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return self._redirect(redirected, request, spider, "meta refresh")

        return response
Developer ID: saidimu, Project: scrapy, Lines of code: 28, Source file: redirect.py
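
The middleware above needs urljoin_rfc because the Location header of a redirect may be a relative reference, so it has to be resolved against the URL of the request that triggered it. That step in isolation, with the header value and URLs invented for illustration:

    from w3lib.url import urljoin_rfc

    request_url = 'http://example.com/articles/list?page=2'
    location = '/articles/item/42'   # a relative Location header
    redirected_url = urljoin_rfc(request_url, location)
    # -> 'http://example.com/articles/item/42'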


Example 5: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Developer ID: bihicheng, Project: scrapy, Lines of code: 10, Source file: regex.py


Example 6: parse

    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        count = 0
        for a in response.xpath('//a'):
            text = a.xpath("string(.)").extract()
            text = "".join(text).strip()
            if len(text) > 5 or "PDF" not in text:
                continue
            href = a.xpath("@href").extract()
            if len(href) != 1:
                continue
            href = href[0]
            if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
                onclick = a.xpath("@onclick").extract()[0]
                onclick = onclick.split(",")
                if len(onclick) < 2:
                    continue
                if onclick[0].startswith("showArticleFile"):
                    id = onclick[-1].split(")", 1)[0].replace("'", "")
                else:
                    id = onclick[1].split(")", 1)[0].replace("'", "")
                if "/CN/" in response.url:
                    pdf = response.url.split("/CN/", 1)[
                              0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                elif "/EN/" in response.url:
                    pdf = response.url.split("/EN/", 1)[
                              0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                else:
                    continue
            elif "attachType=PDF&id=" in href:

                abs_url = urljoin_rfc(response.url, href)
                pdf = abs_url
            else:
                continue
            # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id
            # print pdf
            self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
            count += 1

        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Developer ID: wjianwei126, Project: scrapyc, Lines of code: 53, Source file: pdf.py


Example 7: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors="replace")
            ret.append(link)

        return ret
Developer ID: netconstructor, Project: scrapy, Lines of code: 14, Source file: lxmlparser.py


Example 8: test_urljoin_rfc

 def test_urljoin_rfc(self):
     self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                                  'http://example.com/some/newpath/test')
     self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                                  'http://example.com/some/key/other')
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
Developer ID: LucianU, Project: w3lib, Lines of code: 14, Source file: test_url.py
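
This test also pins down the encoding contract: urljoin_rfc returns an encoded byte string (UTF-8 by default, or whatever encoding= specifies) rather than unicode. In current w3lib releases the function is deprecated, and the usual replacement is the standard library's urljoin; a rough sketch of the equivalent call follows (not a drop-in replacement, since urljoin returns text and performs no encoding):

    from urllib.parse import urljoin   # Python 3 standard library

    urljoin('http://example.com/some/path', 'newpath/test')
    # -> 'http://example.com/some/newpath/test'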


Example 9: parse

    def parse(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        
        site = get_url_site(response.url)

        if site in self.parses:
            parser = self.parses[site]
            #self.log("Parser %s %s"%(response.url,parser.name),level=scrapy.log.INFO)
            for item in parser.parse(response) :
                yield item
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()

            abs_url =urljoin_rfc(base_url,relative_url)
            #print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue            
            site = get_url_site(abs_url)
            yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 27, Source file: base.py


Example 10: parse

    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            # self.log(response.headers,level=scrapy.log.INFO)
            yield scrapy.Request(response.url)
            return
        if response.__class__ != scrapy.http.HtmlResponse:
            return

        base_site = get_url_site(response.url)
        # print response.url,response.status
        base_url = response.url
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if not self.is_valid_url(relative_url):
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            # print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http", "https"]:
                continue
            site = get_url_site(abs_url)

            # yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
            if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
                continue
            self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
            yield scrapy.Request(abs_url)
Developer ID: wjianwei126, Project: scrapyc, Lines of code: 30, Source file: base.py


Example 11: parse_all

    def parse_all(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        base_site = get_url_site(base_url)

        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)

            filename = abs_url.split("?")[0].split("/")[-1]
            if filename :
                ctype  = filename.split(".")[-1].lower() 
            else:
                ctype = None
            if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]:
                continue

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})

            site = get_url_site(abs_url)
            if site != base_site:
                continue
            if ctype in ["pdf","doc","docx","rtf",]:
                continue
            yield scrapy.Request(url=abs_url,callback=self.parse_all)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 31, Source file: pdf.py


Example 12: parse

    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if relative_url.startswith("javascript:"):
                continue
            if "mod=redirect" in relative_url or "redirect.php" in relative_url:
                continue
                
            abs_url =urljoin_rfc(base_url,relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue  

            #yield NimeiItem(url=abs_url,furl=response.url)
            abs_url = self.remove_param(abs_url,["extra","orderby","typeid","filter","sortid","searchsort","vk_payway_13","sid","recommend","digest"])


            if self.PATTERN1.match(abs_url):
                abs_url = re.sub("\-\d+\-\d+\.html.*","-1-1.html",abs_url,1)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
            if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="):
                
                yield scrapy.Request(abs_url)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 29, Source file: bbs.py


Example 13: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Developer ID: bihicheng, Project: scrapy, Lines of code: 16, Source file: htmlparser.py


Example 14: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
Developer ID: bihicheng, Project: scrapy, Lines of code: 16, Source file: sgml.py


Example 15: parse_zgyszz

    def parse_zgyszz(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        #base_site = get_url_site(base_url)
        if  "qklist/show-" in response.url:
            base_url  = get_base_url(response)

            downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
            relative_url = downLink.split("'")[1]

            abs_url = urljoin_rfc(base_url,relative_url)
            yield scrapy.Request(abs_url,callback=self.parse_zgyszz)

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            
            return
        if '/upload/qklist/' in response.url:
            yield self.baidu_rpc_request({"url":response.url,"src_id":22})
            return

        base_url  = response.url
        for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            #request.meta["dont_redirect"] = True
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        
        for sel in response.xpath("//div[@class='flickr']/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue         
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 44, Source file: pdf.py


Example 16: parse

    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url)
            return
        base_url  = get_base_url(response)

        for href in response.xpath('//div[@class="center_bottom_list"]//a/@href').extract():
            if not self.is_valid_url(href):
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url)

        # pagination
        for href in response.xpath('//div[@class="article_list_page"]//a/@href').extract():
            abs_url =urljoin_rfc(base_url,href)
            yield scrapy.Request(url=abs_url)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 19, Source file: sciencemeta-net.py


Example 17: get_base_url

def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin_rfc(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
Developer ID: LucianU, Project: w3lib, Lines of code: 10, Source file: html.py
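
Example 17 is the w3lib helper that several of the spiders above use through Scrapy's wrapper: it scans the HTML for a <base href="..."> declaration and, if one is found, joins it against the page URL; otherwise the page URL is returned unchanged. A small usage sketch with an invented HTML snippet:

    from w3lib.html import get_base_url

    html = '<html><head><base href="/static/"></head><body></body></html>'
    print(get_base_url(html, baseurl='http://example.com/a/page.html'))
    # -> http://example.com/static/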


Example 18: _extract_requests

    def _extract_requests(self, response_text, response_url, response_encoding):
        """Extract requests with absolute urls"""
        self.reset()
        self.feed(response_text)
        self.close()

        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        self._make_absolute_urls(base_url, response_encoding)
        self._fix_link_text_encoding(response_encoding)

        return self.requests
Developer ID: bihicheng, Project: scrapy, Lines of code: 11, Source file: reqext.py


Example 19: parse_index

 def parse_index(self,response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         return
     for href in response.xpath('//div[@class="az"]/ul/li/p/a/@href').extract():
         if "policy.php" in href:
             continue
         abs_url =urljoin_rfc(response.url,href)
         yield scrapy.Request(url=abs_url+"/article/latestArticlesByJournal")
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 11, Source file: sciencemeta-net.py


Example 20: parse_content

 def parse_content(self,response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         yield scrapy.Request(url=response.url,callback=self.parse_content)     
         return
     base_url  = get_base_url(response)
     # parse articles
     for href in response.xpath("//em/a/@href").extract():
         relative_url = href
         abs_url =urljoin_rfc(base_url,relative_url)            
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
Developer ID: muzichenglong, Project: scrapyc, Lines of code: 12, Source file: cqvip.py



Note: the w3lib.url.urljoin_rfc examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and distribution and use should follow the corresponding project's license. Please do not reproduce without permission.

