• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python html.replace_entities函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中w3lib.html.replace_entities函数的典型用法代码示例。如果您正苦于以下问题:Python replace_entities函数的具体用法?Python replace_entities怎么用?Python replace_entities使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了replace_entities函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_regular

 def test_regular(self):
     """Named and numeric entities in str/bytes input decode to unicode chars."""
     # NOTE(review): the entity references below ('&#163;', '&amp;',
     # '&frac12;') had been rendered to their literal characters by the HTML
     # scrape, which made the assertions vacuous (input already equalled the
     # expected output); restored so the inputs actually contain entities.
     self.assertEqual(replace_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(replace_entities(b'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL &amp; SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                      u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
开发者ID:Preetwinder,项目名称:w3lib,代码行数:8,代码来源:test_html.py


示例2: test_illegal_entities

 def test_illegal_entities(self):
     """Unknown or out-of-range entities are dropped or kept per remove_illegal."""
     # NOTE(review): the first input had been mangled by HTML rendering
     # ('&lt;' shown as '<', '&#12345678;' as a replacement character);
     # restored to mirror the remove_illegal=True case two lines below.
     self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                      u'a < b &illegal; c &#12345678; six')
     self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                      u'a < b  c  six')
     self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
     self.assertEqual(replace_entities('x&#157;y'), u'xy')
     self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'x&#157;y')
开发者ID:Preetwinder,项目名称:w3lib,代码行数:8,代码来源:test_html.py


示例3: clean_url

 def clean_url(url):
     """Decode, entity-unescape and absolutize *url*; '' on undecodable input.

     Relies on ``base_url``, ``response_encoding``, ``clean_link`` and
     ``replace_entities`` from the enclosing scope.
     """
     try:
         decoded = url.decode(response_encoding)
         return urljoin(base_url, replace_entities(clean_link(decoded)))
     except ValueError:
         # UnicodeDecodeError is a ValueError subclass, so bad bytes land here.
         return ''
开发者ID:AugustLONG,项目名称:scrapy,代码行数:7,代码来源:regex.py


示例4: text

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    # Decode HTML entities using the page's declared encoding; region.text_content
    # is presumably already tag-free — TODO confirm against the HtmlPage region API.
    text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
    # Collapse every whitespace run to a single space and trim the ends.
    return _WS.sub(u' ', text).strip()
开发者ID:scrapy,项目名称:scrapely,代码行数:30,代码来源:extractors.py


示例5: test_missing_semicolon

 def test_missing_semicolon(self):
     """Entity references lacking the trailing ';' are still recognised."""
     cases = [
         ('&lt&lt!', '<<!'),
         ('&LT!', '<!'),
         ('&#X41 ', 'A '),
         ('&#x41!', 'A!'),
         ('&#x41h', 'Ah'),
         ('&#65!', 'A!'),
         ('&#65x', 'Ax'),
         ('&sup3!', u'\u00B3!'),
         ('&Aacute!', u'\u00C1!'),
         ('&#9731!', u'\u2603!'),
         ('&#153', u'\u2122'),
         ('&#x99', u'\u2122'),
     ]
     for raw, expected in cases:
         # Bare entity, then the same entity embedded between other chars.
         self.assertEqual(replace_entities(raw, encoding='cp1252'), expected)
         self.assertEqual(replace_entities('x%sy' % raw, encoding='cp1252'),
                          u'x%sy' % expected)
开发者ID:Preetwinder,项目名称:w3lib,代码行数:17,代码来源:test_html.py


示例6: extract_raw_text

def extract_raw_text(html):
    """Strip entities, comments, scripts, styles and tags from *html*,
    returning whitespace-normalised plain text."""
    stripped = replace_entities(html)
    # Apply the module-level cleanup regexes in the same fixed order as before:
    # blanks first so the markup-removing patterns see normalised whitespace.
    for pattern in (re_clean_blanks, re_clean_comments, re_clean_javascript,
                    re_clean_style, re_clean_balises):
        stripped = pattern.sub(u' ', stripped)
    stripped = re_clean_blanks.sub(u' ', stripped).strip()
    return re_clean_multiCR.sub(u'\n', stripped)
开发者ID:RouxRC,项目名称:gazouilleur,代码行数:10,代码来源:webmonitor.py


示例7: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """Find anchors via the module-level ``linkre`` regex and return them
        as Link objects with absolutized urls and tag-free text."""
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        def absolutize(raw_url):
            # Decode the raw bytes, unescape entities, trim quote/space padding,
            # then join onto the page's base url.
            return urljoin(base_url, replace_entities(clean_link(raw_url.decode(response_encoding))))

        def tidy_text(raw_text):
            return replace_escape_chars(remove_tags(raw_text.decode(response_encoding))).strip()

        return [Link(absolutize(url).encode(response_encoding), tidy_text(text))
                for url, _, text in linkre.findall(response_text)]
开发者ID:0326,项目名称:scrapy,代码行数:11,代码来源:regex.py


示例8: image_url

def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    # Pull the image url out of the text / CSS `url(...)` form (or None).
    imgurl = extract_image_url(txt)
    # NOTE(review): `url()` is a sibling extractor in this module — presumably
    # it normalises/trims the candidate string; confirm before relying on it.
    # Entities are unescaped and the result safely percent-quoted; a list is
    # returned for consistency with the other extractors, None when no match.
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
开发者ID:scrapy,项目名称:scrapely,代码行数:54,代码来源:extractors.py


示例9: extract_regex

def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group("extract")]  # named group
    except (AttributeError, IndexError):
        # AttributeError: no match at all (search() returned None);
        # IndexError: the pattern has no group named "extract".
        # The previous bare `except:` also hid unrelated errors (even
        # KeyboardInterrupt); these two are the only intended fallbacks.
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        # Bytes input: decode each match before entity replacement.
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
开发者ID:lopuhin,项目名称:scrapy,代码行数:21,代码来源:misc.py


示例10: extract_regex

def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except (AttributeError, IndexError):
        # AttributeError: search() found nothing and returned None;
        # IndexError: the pattern defines no group named 'extract'.
        # Narrowed from a bare `except:` which swallowed everything.
        strings = regex.findall(text)    # full regex or numbered groups
    # flatten() removes nested list/tuple structure, yielding one flat list.
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        # Byte-string input (Python 2): decode before entity replacement.
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
开发者ID:Terrenceyang213,项目名称:SourceLearningNote-Scrapy-,代码行数:21,代码来源:misc_unfinished.py


示例11: parse_item

    def parse_item(self, response):
        """Parse one NHS Choices condition page.

        Yields follow-up Requests for other pages of the same condition found
        in the main content div, then one NhsItem describing this page.
        """
        links = dict()        # href -> {'count', 'label'} for other-condition links
        link_titles = set()   # distinct anchor labels of other-condition links

        # Canonical page url: drop the fragment, lowercase everything.
        url = response.url.split('#')[0].lower()
        # Prefix shared by every subpage of this condition (".../pages/").
        url_head = url.split('/pages/')[0] + '/pages/'

        title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
        if title and title.endswith('- NHS Choices'):
            # FIX: str.rstrip() strips a *character set*, not a suffix, so the
            # old rstrip(' NHS Choices').rstrip(' -') could also eat trailing
            # letters of the real title.  Slice the known suffix off instead.
            title = title[:-len('- NHS Choices')].rstrip()
        subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
        subjects = [s.lower() for s in subjects if s]
        if not subjects:
            # Fall back to the page title when no ontology subjects are given.
            subjects = [title.lower()]
        description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
        raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
        page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
        for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
            label = a.xpath('text()').extract_first()
            href = a.xpath('@href').extract_first()
            if href and label:
                # NOTE(review): base_url is prepended even to already-absolute
                # hrefs — presumably all in-content links are site-relative;
                # confirm against the crawled markup.
                href = self.base_url + href.lstrip('/')
                href = href.lower()
                label = clean_text(label)
                if '/conditions/' in href and url_head not in href:
                    # Link to a *different* condition: tally label and count.
                    link_titles.add(label)
                    if href in links:
                        links[href]['count'] += 1
                    else:
                        links[href] = {
                            'count': 1,
                            'label': label
                        }
                if url_head in href and href != url:
                    # Another page of the *same* condition: crawl it too.
                    print("********************", href)
                    yield scrapy.Request(href, self.parse_item)

        article = NhsItem()
        article['url'] = url
        article['title'] = title
        article['subjects'] = subjects
        article['description'] = description
        article['page_content'] = str(page_content)
        article['links'] = links
        article['link_titles'] = list(link_titles)
        yield article
开发者ID:mattkohl,项目名称:nhs-choice-search,代码行数:46,代码来源:conditions.py


示例12: _has_ajaxcrawlable_meta

def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Cheap substring probes first: stripping scripts and comments is about
    # 20x slower, so bail out immediately for the common page that cannot
    # possibly contain the meta tag.
    if 'fragment' not in text or 'content' not in text:
        return False

    cleaned = html.remove_tags_with_content(text, ('script', 'noscript'))
    cleaned = html.replace_entities(cleaned)
    cleaned = html.remove_comments(cleaned)
    return _ajax_crawlable_re.search(cleaned) is not None
开发者ID:01-,项目名称:scrapy,代码行数:24,代码来源:ajaxcrawl.py


示例13: clean_link

import urllib
import urlparse
from urlparse import urljoin
from w3lib.html import replace_entities


def clean_link(link_text):
    """Strip surrounding whitespace and quote characters from *link_text*."""
    padding = "\t\r\n '\""
    return link_text.strip(padding)

# Return the first item of a sequence (e.g. the first extracted url address),
# or None when the sequence is empty or falsy.
def list_first_item(x):
    return x[0] if x else None

# Build an absolute url: decode the raw link bytes, strip quote/space padding,
# unescape HTML entities, then join the result onto base_url.
def clean_url(base_url, u, response_encoding):
    return urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))


# Read a single request parameter from a url's query string.
def get_query(url, key):
    """Return the first value of query parameter *key* in *url*.

    Raises KeyError when the parameter is absent (same as before).
    """
    parts = list(urlparse.urlparse(url))
    params = urlparse.parse_qs(parts[4])  # index 4 is the query component
    return params[key][0]


# 设置请求参数
def set_query(url, **args):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])
开发者ID:dumengnan,项目名称:ohmydata_spider,代码行数:29,代码来源:select_result.py


示例14: test_returns_unicode

 def test_returns_unicode(self):
     """replace_entities returns unicode for both bytes and unicode input."""
     samples = [b'no entities', b'Price: &pound;100!',
                u'no entities', u'Price: &pound;100!']
     for sample in samples:
         assert isinstance(replace_entities(sample), six.text_type)
开发者ID:Preetwinder,项目名称:w3lib,代码行数:6,代码来源:test_html.py


示例15: test_encoding

 def test_encoding(self):
     """Raw bytes and numeric references alike decode via *encoding*."""
     result = replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252')
     self.assertEqual(result, u'x\u2122\u2122\u2122y')
开发者ID:Preetwinder,项目名称:w3lib,代码行数:3,代码来源:test_html.py


示例16: _cleanup

def _cleanup(value):
    """Strip tags and entities, then collapse whitespace runs to single spaces."""
    stripped = replace_entities(replace_tags(value)).strip()
    return " ".join(stripped.split())
开发者ID:pombredanne,项目名称:fortia,代码行数:2,代码来源:fortia-server.py


示例17: test_browser_hack

 def test_browser_hack(self):
     """Numeric references in the 0x80-0x9F range decode via the page
     encoding, mimicking browser behaviour."""
     for ref in ('x&#153;y', 'x&#x99;y'):
         self.assertEqual(replace_entities(ref, encoding='cp1252'), u'x\u2122y')
开发者ID:Preetwinder,项目名称:w3lib,代码行数:4,代码来源:test_html.py


示例18: remove_entities

def remove_entities(text, encoding):
    """Replace HTML entities in *text*, preserving those in _ENTITIES_TO_KEEP."""
    return replace_entities(text, encoding=encoding, keep=_ENTITIES_TO_KEEP)
开发者ID:daqv,项目名称:portia-dashboard,代码行数:2,代码来源:html.py


示例19: test_keep_entities

 def test_keep_entities(self):
     """Entities named in *keep* survive; the rest are still replaced."""
     expected = u'<b>Low &lt; High &amp; Medium \xa3 six</b>'
     self.assertEqual(
         replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                          keep=['lt', 'amp']),
         expected)
     self.assertEqual(
         replace_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>',
                          keep=[u'lt', u'amp']),
         expected)
开发者ID:Preetwinder,项目名称:w3lib,代码行数:6,代码来源:test_html.py


示例20: type

    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg


def clean_link(link_text):
    """
        Trim leading and trailing whitespace and quote punctuation
        from the given link text.
    """
    junk_chars = "\t\r\n '\""
    return link_text.strip(junk_chars)


def clean_url(base_url, u, response_encoding):
    """Unescape entities in a trimmed link and join it onto *base_url*."""
    unescaped = replace_entities(text=clean_link(u), encoding=response_encoding)
    return urljoin(base_url, unescaped)
#
# clean_url = lambda base_url, u, response_encoding: urljoin(base_url,
#                                                            replace_entities(
#                                                                text=clean_link(u.decode(response_encoding, 'ignore')),
#                                                                encoding=response_encoding)
# )

"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""
开发者ID:DASungta,项目名称:CranberrySearchEngine_Spider,代码行数:30,代码来源:select_result.py



注:本文中的w3lib.html.replace_entities函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python http.basic_auth_header函数代码示例发布时间:2022-05-26
下一篇:
Python html.remove_tags函数代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap