本文整理汇总了Python中w3lib.encoding.html_to_unicode函数的典型用法代码示例。如果您正苦于以下问题:Python html_to_unicode函数的具体用法?Python html_to_unicode怎么用?Python html_to_unicode使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了html_to_unicode函数的19个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_replace_wrong_encoding
def test_replace_wrong_encoding(self):
    """Test invalid chars are replaced properly.

    The payloads are written as byte literals so that the function under
    test receives raw bytes on Python 3 as well; on Python 2 the ``b``
    prefix does not change the value.
    """
    encoding, body_unicode = html_to_unicode(ct('utf-8'),
                                             b'PREFIX\xe3\xabSUFFIX')
    # XXX: Policy for replacing invalid chars may suffer minor variations
    # but it should always contain the unicode replacement char (U+FFFD)
    assert u'\ufffd' in body_unicode, repr(body_unicode)
    # The valid text around the broken byte sequence must survive.
    assert u'PREFIX' in body_unicode, repr(body_unicode)
    assert u'SUFFIX' in body_unicode, repr(body_unicode)

    # Do not destroy html tags due to encoding bugs
    encoding, body_unicode = html_to_unicode(ct('utf-8'),
                                             b'\xf0<span>value</span>')
    assert u'<span>value</span>' in body_unicode, repr(body_unicode)
开发者ID:Dior222,项目名称:w3lib,代码行数:14,代码来源:test_encoding.py
示例2: test_gunzip_illegal_eof
def test_gunzip_illegal_eof(self):
    """Gunzip the ``unexpected-eof.gz`` sample and compare it to the stored output.

    The decompressed bytes are decoded as cp1252 and must match the UTF-8
    text kept in ``unexpected-eof-output.txt``.
    """
    archive_path = join(SAMPLEDIR, "unexpected-eof.gz")
    with open(archive_path, "rb") as archive:
        compressed = archive.read()
    _, text = html_to_unicode("charset=cp1252", gunzip(compressed))

    reference_path = join(SAMPLEDIR, "unexpected-eof-output.txt")
    with open(reference_path, "rb") as reference:
        expected_text = reference.read().decode("utf-8")

    # Compare lengths first so a size mismatch is reported on its own.
    self.assertEqual(len(text), len(expected_text))
    self.assertEqual(text, expected_text)
开发者ID:lopuhin,项目名称:scrapy,代码行数:7,代码来源:test_utils_gz.py
示例3: extract
def extract(self, html='', **kwargs):
    """
    Extract data fields from raw html or from a url.

    Args:
        html: raw html to analyse. When it is empty and a ``url`` keyword
            is supplied, the page is downloaded and decoded instead.
        **kwargs: ``url`` - address to fetch when ``html`` is empty;
            ``verbose`` - when the key is present, every region and its
            records are printed.

    Returns:
        The regions found in the document; each region gets an ``items``
        attribute holding its aligned records.
    """
    if not html and 'url' in kwargs:
        info = urlopen(kwargs.pop('url'))
        try:
            # The HTTP header is spelled with a hyphen; looking it up as
            # 'content_type' never matches, so the charset declared by the
            # server was silently ignored.
            _, html = html_to_unicode(info.headers.get('content-type'), info.read())
        finally:
            # Release the connection even if reading or decoding fails.
            info.close()

    builder = DomTreeBuilder(html)
    root = builder.build()

    region_finder = MiningDataRegion(root, self.k, self.threshold)
    regions = region_finder.find_regions(root)

    record_finder = MiningDataRecord(self.threshold)
    field_finder = MiningDataField()

    for region in regions:
        records = record_finder.find_records(region)
        items, _ = field_finder.align_records(records)
        region.items = items

        if 'verbose' in kwargs:
            # Single-argument print calls produce the same output under
            # both the Python 2 print statement and the Python 3 function.
            print(region)
            for record in records:
                print('\t%s' % (record,))

    return regions
开发者ID:tpeng,项目名称:pydepta,代码行数:27,代码来源:depta.py
示例4: _assert_encoding
def _assert_encoding(self, content_type, body, expected_encoding,
                     expected_unicode):
    """Decode ``body`` and verify both the detected encoding and the text.

    Encoding names are passed through ``norm_encoding`` on both sides
    before they are compared.
    """
    detected, decoded = html_to_unicode(ct(content_type), body)
    self.assertTrue(isinstance(decoded, unicode))

    actual_name = norm_encoding(detected)
    wanted_name = norm_encoding(expected_encoding)
    self.assertEqual(actual_name, wanted_name)

    self.assertEqual(decoded, expected_unicode)
开发者ID:Dior222,项目名称:w3lib,代码行数:7,代码来源:test_encoding.py
示例5: test_unicode_body
def test_unicode_body(self):
    """A body encoded as cp1251 must decode back to the original text."""
    expected_text = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
    raw_body = expected_text.encode('cp1251')

    _, decoded = html_to_unicode(ct('cp1251'), raw_body)

    # The result has to be a unicode object carrying the very same text.
    self.assertTrue(isinstance(decoded, unicode))
    self.assertEqual(decoded, expected_text)
开发者ID:Dior222,项目名称:w3lib,代码行数:7,代码来源:test_encoding.py
示例6: url_to_page
def url_to_page(url, encoding=None, default_encoding='utf-8'):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage
    object is the url of the final page redirected to.

    If the encoding of the page is known, it can be passed as a keyword
    argument. If unspecified, the encoding is guessed using
    `w3lib.encoding.html_to_unicode`. `default_encoding` is used if the
    encoding cannot be determined.

    The response handle is always closed before returning, including when
    reading or decoding raises.
    """
    fh = urlopen(url)
    try:
        info = fh.info()
        body_str = fh.read()
        # guess content encoding if not specified
        if encoding is None:
            try:
                # Python 3.x
                content_type_header = fh.getheader("content-type")
            except AttributeError:
                # Python 2.x
                content_type_header = info.getheader("content-type")
            encoding, body = html_to_unicode(content_type_header, body_str,
                                             default_encoding=default_encoding)
        else:
            # The caller forced an encoding: decode with it directly.
            body = body_str.decode(encoding)
        return HtmlPage(fh.geturl(), headers=dict(info.items()), body=body, encoding=encoding)
    finally:
        # The original never closed the handle, leaking the connection.
        fh.close()
开发者ID:abudulemusa,项目名称:scrapely,代码行数:29,代码来源:htmlpage.py
示例7: factory
def factory(self, data, parser_cls, url):
    """Decode ``data`` with a utf-8 charset hint and parse it with ``parser_cls``.

    The decoded text is re-encoded as utf8 before parsing; an empty result
    is replaced by a minimal ``<html/>`` document. ``url`` is handed to the
    parser call as the base URL.
    """
    _, decoded = html_to_unicode('charset=utf-8', data)

    body = decoded.encode('utf8')
    if not body:
        body = '<html/>'

    parser = parser_cls(recover=True, encoding='utf8')
    return etree.fromstring(body, parser=parser, base_url=url)
开发者ID:ysc8620,项目名称:redant_spider,代码行数:8,代码来源:index.py
示例8: text
def text(self):
    """Return the body decoded to unicode, computing it at most once."""
    # self.encoding has to be read before the cache is inspected, so that
    # _body_inferred_encoding gets its chance to run first.
    body_encoding = self.encoding
    if self._cached_ubody is not None:
        return self._cached_ubody

    _, decoded = html_to_unicode('charset=%s' % body_encoding, self.body)
    self._cached_ubody = decoded
    return self._cached_ubody
开发者ID:wusy1209,项目名称:scrapy,代码行数:9,代码来源:text.py
示例9: _body_inferred_encoding
def _body_inferred_encoding(self):
if self._cached_benc is None:
content_type = to_native_str(self.headers.get(b'Content-Type', b''))
benc, ubody = html_to_unicode(content_type, self.body,
auto_detect_fun=self._auto_detect_fun,
default_encoding=self._DEFAULT_ENCODING)
self._cached_benc = benc
self._cached_ubody = ubody
return self._cached_benc
开发者ID:wusy1209,项目名称:scrapy,代码行数:9,代码来源:text.py
示例10: body_as_unicode
def body_as_unicode(self):
    """Return body as unicode"""
    # Read self.encoding first: it must be evaluated before the cached body
    # is checked, in case _body_inferred_encoding is triggered by it.
    declared = self.encoding
    cached = self._cached_ubody
    if cached is None:
        hint = 'charset=%s' % declared
        cached = html_to_unicode(hint, self.body)[1]
        self._cached_ubody = cached
    return self._cached_ubody
开发者ID:AugustLONG,项目名称:scrapy,代码行数:9,代码来源:text.py
示例11: body_as_unicode
def body_as_unicode(self):
    """Return body as unicode.

    The content is decoded with a charset hint built from ``self.encoding``.
    The result is stored in ``self._cached_ubody``, but it is recomputed on
    every call because no cache check is made here.
    """
    # Imported inside the method, as before; only html_to_unicode is used,
    # so the other names that were imported alongside it are dropped.
    from w3lib.encoding import html_to_unicode

    benc = self.encoding
    charset = 'charset=%s' % benc
    self._cached_ubody = html_to_unicode(charset, self.content)[1]
    return self._cached_ubody
开发者ID:deniyes,项目名称:pyspider,代码行数:10,代码来源:response.py
示例12: response2unicode
def response2unicode(resp):
    """
    Convert requests.Response body to unicode.

    Unlike ``response.text`` it handles <meta> tags in response content.
    Only the decoded text is returned; the detected encoding is discarded.
    """
    header_value = resp.headers.get("Content-Type")
    raw_body = resp.content
    _, decoded = html_to_unicode(
        content_type_header=header_value,
        html_body_str=raw_body,
        auto_detect_fun=_autodetect_encoding,
    )
    return decoded
开发者ID:RaoUmer,项目名称:Formasaurus,代码行数:11,代码来源:utils.py
示例13: encoding
def encoding(self) -> str:
    """The encoding string to be used, extracted from the HTML and
    :class:`HTMLResponse <HTMLResponse>` headers.

    A previously stored encoding wins; otherwise the raw HTML (when there
    is any) is inspected, and ``default_encoding`` is the final fallback.
    """
    known = self._encoding
    if known:
        return known

    # Scan meta tags for charset.
    # NOTE(review): default_encoding is passed in the position of the
    # content-type header - confirm this is intended.
    if self._html:
        self._encoding = html_to_unicode(self.default_encoding, self._html)[0]

    return self._encoding or self.default_encoding
开发者ID:666King999,项目名称:requests-html,代码行数:12,代码来源:requests_html.py
示例14: infer
def infer(self, html='', **kwargs):
    """
    Extract data with seed region and the data you expect to scrape from there.

    Args:
        html: raw html to scrape. It is replaced by the downloaded page
            whenever a ``url`` keyword is supplied.
        **kwargs: ``url`` - address of the page to download and decode.

    Returns:
        Whatever ``self.scraper.scrape_page`` returns for the rebuilt page.
    """
    if 'url' in kwargs:
        info = urlopen(kwargs.pop('url'))
        try:
            # The HTTP header is spelled with a hyphen; looking it up as
            # 'content_type' never matches, so the charset declared by the
            # server was silently ignored.
            _, html = html_to_unicode(info.headers.get('content-type'), info.read())
        finally:
            # Release the connection even if reading or decoding fails.
            info.close()

    builder = DomTreeBuilder(html)
    doc = builder.build()
    # Serialise the rebuilt tree back to unicode html for the scraper.
    page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))

    return self.scraper.scrape_page(page)
开发者ID:tpeng,项目名称:pydepta,代码行数:13,代码来源:depta.py
示例15: _assert_encoding
def _assert_encoding(self, content_type, body, expected_encoding,
                     expected_unicode):
    """Decode ``body`` and verify the detected encoding and decoded text.

    ``expected_unicode`` is either one string, which must equal the decoded
    text, or a container of acceptable strings, which must include it.
    """
    detected, decoded = html_to_unicode(ct(content_type), body)
    self.assertTrue(isinstance(decoded, unicode))
    self.assertEqual(norm_encoding(detected),
                     norm_encoding(expected_encoding))

    if not isinstance(expected_unicode, basestring):
        # Several outcomes are acceptable: the decoded text must be one of them.
        is_acceptable = decoded in expected_unicode
        failure_message = "%s is not in %s" % (decoded, expected_unicode)
        self.assertTrue(is_acceptable, failure_message)
        return

    self.assertEqual(decoded, expected_unicode)
开发者ID:TontonMax,项目名称:w3lib,代码行数:14,代码来源:test_encoding.py
示例16: extract
def extract(self, html="", **kwargs):
    """
    Extract data regions from raw html or from a url.

    Args:
        html: raw html to analyse. It is replaced by the downloaded page
            whenever a ``url`` keyword is supplied.
        **kwargs: ``url`` - address of the page to download and decode;
            ``verbose`` - when the key is present, every region and its
            records are printed; ``annotate`` - path of a file that
            receives the annotated document.

    Returns:
        A flat list with the aligned items of every region.
    """
    if "url" in kwargs:
        info = urlopen(kwargs.pop("url"))
        try:
            # The HTTP header is spelled with a hyphen; looking it up as
            # "content_type" never matches, so the charset declared by the
            # server was silently ignored.
            _, html = encoding.html_to_unicode(info.headers.get("content-type"), info.read())
        finally:
            # Release the connection even if reading or decoding fails.
            info.close()

    builder = DomTreeBuilder(html)
    root = builder.build()

    mining_region = MiningDataRegion(root, self.k, self.threshold)
    regions = mining_region.find_regions(root)

    mining_record = MiningDataRecord()
    mining_field = MiningDataField()

    region_records = {}
    all_items = []
    for region in regions:
        records = mining_record.find_records(region)
        items, _ = mining_field.align_records(records)
        all_items.extend(items)

        assert len(items) == len(records)

        region_records[region] = records

        if "verbose" in kwargs:
            # Single-argument print calls produce the same output under
            # both the Python 2 print statement and the Python 3 function.
            print(region)
            for record in records:
                print("\t%s" % (record,))

    # always annotate at last to avoid modify the DOM tree
    if "annotate" in kwargs:
        for i, region in enumerate(regions):
            for j, record in enumerate(region_records.get(region)):
                self.annotate(i, j, record.elements)

        with open(kwargs.pop("annotate"), "w") as f:
            # Same bytes as the former `print >> f, ...`: the serialised
            # tree followed by a newline.
            # NOTE(review): assumes tostring() returns a native str here,
            # as under Python 2 - confirm before porting to Python 3.
            f.write(tostring(root, pretty_print=True) + "\n")

    return all_items
开发者ID:netconstructor,项目名称:pydepta,代码行数:40,代码来源:depta.py
示例17: HTMLParser
safe_attrs_only=False
)
parser = HTMLParser(encoding=encoding)
html = lxml.html.document_fromstring(html, parser=parser)
doc = cleaner.clean_html(html)
return lxml.etree.tounicode(doc)
def mkdir(path):
    """Create the directory ``path`` (and missing parents) if it is absent.

    An already existing directory is not an error. Any other failure, such
    as missing permissions or a regular file occupying ``path``, is
    re-raised instead of being swallowed.

    Raises:
        OSError: the directory could not be created and does not exist.
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs also fails when the directory is already there; only
        # that case is harmless.
        if not os.path.isdir(path):
            raise
if __name__ == '__main__':
    # Command-line entry point: clean every input file and write the result,
    # under the same file name, into the --out directory as utf8.
    args = docopt(__doc__)
    out_dir = args['--out']
    mkdir(out_dir)

    for in_name in args['<input>']:
        out_name = os.path.join(out_dir, os.path.basename(in_name))

        with open(in_name, 'rb') as source:
            raw = source.read()
        # Detect the encoding from the document itself (no header available).
        encoding, html = html_to_unicode(None, raw)

        cleaned = clean_html(html.encode(encoding), encoding)

        with codecs.open(out_name, 'w', encoding='utf8') as out:
            out.write(cleaned)
开发者ID:CyberIntelMafia,项目名称:webstruct,代码行数:30,代码来源:clean_html.py
示例18: open
with open(kwargs.pop("annotate"), "w") as f:
print >> f, tostring(root, pretty_print=True)
return all_items
def annotate(self, region, record, elements):
    """
    Annotate the HTML elements with PyQuery.

    The first element is wrapped in a ``div`` carrying the region and record
    ids and a colour picked at random from a fixed palette; the remaining
    elements are then appended to that wrapper.
    """
    palette = ["#ffff42", "#ff0000", "#00ff00", "#ff00ff"]
    head = pq(elements[0])
    wrapper_markup = '<div class="mdr_region" region_id={} record_id={} style="color:{}; border:solid 5px"></div>'.format(
        region, record, choice(palette)
    )
    wrapper = head.wrap(wrapper_markup)
    for element in elements[1:]:
        wrapper.append(element)
if __name__ == "__main__":
    # Command-line entry point: download the page given as first argument,
    # extract its items, write the annotated document to output.html and
    # print one line per item.
    import sys

    info = urlopen(sys.argv[1])
    try:
        # The HTTP header is spelled with a hyphen; looking it up as
        # "content_type" never matches, so the charset declared by the
        # server was silently ignored.
        _, html = encoding.html_to_unicode(info.headers.get("content-type"), info.read())
    finally:
        # Release the connection even if reading or decoding fails.
        info.close()

    depta = Depta()
    items = depta.extract(html, annotate="output.html", verbose=True)
    for i, item in enumerate(items):
        # One formatted string gives the same "index text" line under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s" % (i, " | ".join(x.text for x in item.fields)))
开发者ID:netconstructor,项目名称:pydepta,代码行数:30,代码来源:depta.py
示例19: _assert_encoding_detected
def _assert_encoding_detected(self, content_type, expected_encoding, body,
                              **kwargs):
    """Decode ``body`` and check that the expected encoding was detected.

    ``body`` must not be text already; extra keyword arguments are
    forwarded unchanged to ``html_to_unicode``.
    """
    assert not isinstance(body, six.text_type)

    detected, decoded = html_to_unicode(ct(content_type), body, **kwargs)

    self.assertTrue(isinstance(decoded, six.text_type))
    actual_name = norm_encoding(detected)
    wanted_name = norm_encoding(expected_encoding)
    self.assertEqual(actual_name, wanted_name)
开发者ID:azizur77,项目名称:w3lib,代码行数:6,代码来源:test_encoding.py
注:本文中的w3lib.encoding.html_to_unicode函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论