本文整理汇总了Python中requests.utils.get_encodings_from_content函数的典型用法代码示例。如果您正苦于以下问题:Python get_encodings_from_content函数的具体用法?Python get_encodings_from_content怎么用?Python get_encodings_from_content使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了get_encodings_from_content函数的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_precedence
def test_precedence(self):
    """Encodings are reported in priority order: HTML5 meta charset first,
    then the HTML4 http-equiv declaration, then the XML prolog."""
    markup = '''
<?xml version="1.0" encoding="XML"?>
<meta charset="HTML5">
<meta http-equiv="Content-type" content="text/html;charset=HTML4" />
'''.strip()
    expected = ['HTML5', 'HTML4', 'XML']
    assert get_encodings_from_content(markup) == expected
开发者ID:PoNote,项目名称:requests,代码行数:7,代码来源:test_utils.py
示例2: find_encoding
def find_encoding(content, headers=None):
    """Guess the character encoding of *content*.

    Tries, in order: the Content-Type header (if *headers* is given),
    charset declarations inside the document, then chardet detection.
    Returns 'unicode' for already-decoded text and 'latin_1' as the
    last-resort fallback.
    """
    # Already-decoded text needs no detection.
    if isinstance(content, unicode):
        return 'unicode'
    guess = None
    # 1) Content-Type header, ignoring requests' ISO-8859-1 default.
    if headers:
        guess = get_encoding_from_headers(headers)
        if guess == 'ISO-8859-1':
            guess = None
    # 2) <meta>/<?xml?> declarations inside the document body.
    if not guess:
        candidates = get_encodings_from_content(content)
        guess = candidates[0] if candidates else None
    # 3) Statistical detection, when chardet is importable.
    if not guess and chardet is not None:
        guess = chardet.detect(content)['encoding']
    # gb18030 is a superset of gb2312; mislabelled pages decode more reliably.
    if guess and guess.lower() == 'gb2312':
        guess = 'gb18030'
    return guess or 'latin_1'
开发者ID:Crooky,项目名称:qiandao,代码行数:26,代码来源:utils.py
示例3: guess_response_encoding
def guess_response_encoding(resp):
    '''
    Guess the content encoding of a requests response.

    Tries charsets declared inside the body first, then the Content-Type
    header, and finally falls back to statistical detection.
    Note: there's a performance issue due to chardet.
    '''
    # First try the encodings declared in the response content itself.
    encs = get_encodings_from_content(resp.content) or []
    for enc in encs:
        try:
            resp.content.decode(enc)
            # Lazy %-args: the message is only formatted if the level is enabled.
            LOG.info('Detected encoding %s from response content.', enc)
            return enc
        except UnicodeDecodeError:
            LOG.debug('Encoding from response content doesn\'t work.')
    # Then the charset from the Content-Type response header.
    enc = get_encoding_from_headers(resp.headers)
    if enc:
        try:
            resp.content.decode(enc)
            LOG.info('Detected encoding %s from response header.', enc)
            return enc
        except UnicodeDecodeError:
            LOG.debug('Encoding from response header doesn\'t work.')
    # Neither encoding works, we have to go the hard way (chardet is slow).
    start = clock()
    g = detect(resp.content)
    # Fixed typo ("cofidence" -> "confidence") and switched to lazy logging args.
    LOG.info('Detected encoding %s with confidence of %g in %gs.',
             g['encoding'], g['confidence'], clock() - start)
    return g['encoding']
开发者ID:dirtysalt,项目名称:dirtysalt.github.io,代码行数:30,代码来源:utils.py
示例4: encoding
def encoding(self):
    """Guessed encoding of ``self.content``, cached on first access.

    Resolution order: Content-Type header, in-document declarations,
    chardet; defaults to 'utf-8'. Returns 'unicode' for decoded text.
    """
    # Serve the memoized answer when available.
    if hasattr(self, '_encoding'):
        return self._encoding
    # Already-decoded text needs no charset.
    if isinstance(self.content, unicode):
        return 'unicode'
    # Try charset from the Content-Type header; requests defaults to
    # ISO-8859-1, which we treat as "unknown".
    guess = get_encoding_from_headers(self.headers)
    if guess == 'ISO-8859-1':
        guess = None
    # Try charset declarations inside the document.
    if not guess:
        declared = get_encodings_from_content(self.content)
        guess = declared[0] if declared else None
    # Fall back to statistical detection.
    if not guess and chardet is not None:
        guess = chardet.detect(self.content)['encoding']
    # gb18030 supersedes gb2312 and decodes mislabelled pages too.
    if guess and guess.lower() == 'gb2312':
        guess = 'gb18030'
    self._encoding = guess or 'utf-8'
    return self._encoding
开发者ID:5aket,项目名称:pyspider,代码行数:27,代码来源:response.py
示例5: encoding
def encoding(rsp):
    """
    encoding of Response.content.
    if Response.encoding is None, encoding will be guessed
    by header or content or chardet if available.
    """
    # Decoded text carries no byte encoding.
    if isinstance(rsp.content, six.text_type):
        return 'unicode'
    # Charset from the Content-Type header; ISO-8859-1 is requests'
    # placeholder default, so treat it as unknown.
    guess = get_encoding_from_headers(rsp.headers)
    if guess == 'ISO-8859-1':
        guess = None
    # Charset declared inside the document, when the helper is importable.
    if not guess and get_encodings_from_content:
        declared = get_encodings_from_content(rsp.content)
        guess = declared[0] if declared else None
    # Last resort: statistical detection via chardet.
    if not guess and chardet is not None:
        guess = chardet.detect(rsp.content)['encoding']
    # Map gb2312 to its superset gb18030 for robustness.
    if guess and guess.lower() == 'gb2312':
        guess = 'gb18030'
    return guess or 'utf-8'
开发者ID:zymtech,项目名称:parse_newspage,代码行数:30,代码来源:parserstandalone.py
示例6: _fetchContent
def _fetchContent(self):
    """Download ``self.url`` and return its body decoded with the detected
    charset; also stores the charset on ``self.encoding``.
    """
    r = requests.get(self.url)
    # Compute the declared encodings once instead of calling the
    # regex-based helper twice on the same content.
    declared = get_encodings_from_content(r.content)
    if declared:
        self.encoding = declared[0]
    else:
        # No in-document declaration: fall back to the charset reported
        # by the server via a second plain-urllib2 request.
        from contextlib import closing
        from urllib2 import urlopen
        with closing(urlopen(self.url)) as f:
            self.encoding = f.info().getparam("charset")
    # HACK: reload(sys)/setdefaultencoding mutates interpreter-wide state
    # and is strongly discouraged; kept for backward compatibility with
    # callers that rely on the changed default codec.
    reload(sys)
    sys.setdefaultencoding(self.encoding)
    content = r.content.decode(self.encoding)
    return content
开发者ID:Lab-317,项目名称:NewsParser,代码行数:18,代码来源:NewsParser.py
示例7: encoding
def encoding(self):
    """
    encoding of Response.content.
    if Response.encoding is None, encoding will be guessed
    by header or content or chardet if available.
    """
    # Memoized result from an earlier call.
    if hasattr(self, '_encoding'):
        return self._encoding
    # Already-decoded text has no byte encoding.
    if isinstance(self.content, six.text_type):
        return 'unicode'
    # Charset from the Content-Type header; requests' ISO-8859-1
    # placeholder means "unknown" here.
    detected = get_encoding_from_headers(self.headers)
    if detected == 'ISO-8859-1':
        detected = None
    # Charset declared inside the document (only the first 100 bytes are
    # inspected; on Python 3 they must be unicode for the regexes).
    if not detected and get_encodings_from_content:
        if six.PY3:
            sample = utils.pretty_unicode(self.content[:100])
        else:
            sample = self.content
        declared = get_encodings_from_content(sample)
        detected = declared[0] if declared else None
    # Statistical fallback on a short prefix to bound chardet's cost.
    if not detected and chardet is not None:
        detected = chardet.detect(self.content[:600])['encoding']
    # gb18030 is a superset of gb2312.
    if detected and detected.lower() == 'gb2312':
        detected = 'gb18030'
    self._encoding = detected or 'utf-8'
    return self._encoding
开发者ID:01jiagnwei01,项目名称:pyspider,代码行数:36,代码来源:response.py
示例8: procdata_getencoding
def procdata_getencoding(seed,headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
elif code.lower() == 'utf-8':
code = 'utf-8'
else:
code = None
if code == None:
code = utils.get_encodings_from_content(content)
print "content",seed,code
if code:
code = code[0]
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return code
开发者ID:salmonx,项目名称:fengbei,代码行数:20,代码来源:daemon.py
示例9: guess_content_encoding
def guess_content_encoding(content):
    '''
    Guess the encoding for plain content.

    Tries charsets declared inside the content first, then falls back
    to statistical detection.
    Note: there's a performance issue due to chardet.
    '''
    # First try the encodings declared by the content itself.
    encs = get_encodings_from_content(content) or []
    for enc in encs:
        try:
            content.decode(enc)
            # Lazy %-args: formatted only when the log level is enabled.
            LOG.info('Detected encoding %s from content.', enc)
            return enc
        except UnicodeDecodeError:
            LOG.debug('Encoding from content doesn\'t work.')
    # Neither encoding works, we have to go the hard way (chardet is slow).
    start = clock()
    g = detect(content)
    # Fixed typo ("cofidence" -> "confidence") and switched to lazy logging args.
    LOG.info('Detected encoding %s with confidence of %g in %gs.',
             g['encoding'], g['confidence'], clock() - start)
    return g['encoding']
开发者ID:dirtysalt,项目名称:dirtysalt.github.io,代码行数:21,代码来源:utils.py
示例10: filter_encoding
def filter_encoding(self,seed, headers,content):
code = utils.get_encoding_from_headers(headers)
if code:
if code.lower() == 'gbk' or code.lower() == 'gb2312':
code = 'gbk'
return True
elif code.lower() == 'utf-8' or code.lower() == 'utf8':
code = 'utf8'
# as for utf8, we should check the content
else: # 'ISO-8859-1' and so on,
code = None
# chinese website may also miss the content-encoding header, so detect the content
if code == None:
codes = utils.get_encodings_from_content(content)
if codes:
for code in codes:
if code.lower() in [ 'gbk','gb2312']:
return True
elif code.lower() == 'utf8' or code.lower() == 'utf-8':
code = 'utf8'
break
if code != 'utf8':
return False
# here handle utf8
# to detect any chinese char win
try:
ucon = content.decode('utf8')
for uchar in ucon:
i = ord(uchar)
if i >= 0x4e00 and i <= 0x9fa5:
return True
except Exception, e:
print url, e
pass
开发者ID:salmonx,项目名称:fengbei,代码行数:38,代码来源:worker_filter.py
示例11: test_pragmas
def test_pragmas(self, content):
    """Every pragma fixture must declare exactly one encoding: 'UTF-8'."""
    found = get_encodings_from_content(content)
    assert found == ['UTF-8']
开发者ID:PoNote,项目名称:requests,代码行数:4,代码来源:test_utils.py
示例12: test_none
def test_none(self):
    """An empty document declares no encodings at all."""
    assert not get_encodings_from_content('')
开发者ID:PoNote,项目名称:requests,代码行数:3,代码来源:test_utils.py
示例13: on_incoming
def on_incoming(self, msg):
    """Announce link info for every URL in a channel message.

    For HTML pages, posts the <title> (or first <h1>); for other
    resources, posts filename, content type and size. Silently swallows
    all errors by design (see comment below).
    """
    # Only react to channel messages, not private ones.
    if not msg.type == msg.CHANNEL:
        return
    # Catching all exceptions without alerting, as there is just so much crap that can go wrong with web stuff. Also, I'm lazy.
    try:
        urls = self.url_re.findall(msg.body)
        for url in urls:
            # Catch edge case where url is in brackets
            while url.startswith('(') and url.endswith(')'):
                url = url[1:-1]
            # HEAD first: cheap way to get headers and follow redirects.
            head = requests.head(url, allow_redirects=True)
            # work on the URL we were redirected to, if any
            url = head.url
            message = ""
            content_type = head.headers['content-type']
            # HTML websites
            if 'text/html' in content_type:
                # Set up any required request headers
                req_headers = {}
                # TODO: Accept-Language header from config
                req = requests.get(url, headers=req_headers, timeout=5)
                if 'charset' not in content_type:
                    # requests only looks at headers to detect the encoding, we must find the charset ourselves
                    # we can't use req.content because regex doesn't work on bytestrings apparently
                    encodings = get_encodings_from_content(req.text)
                    if encodings:
                        # Re-decode the body with the declared charset.
                        req.encoding = encodings[0]
                soup = BeautifulSoup(req.text)
                # Look for the <title> tag or an <h1>, whichever is first
                title = soup.find(['title', 'h1'])
                if title is None:
                    # NOTE(review): returns (abandoning any remaining URLs
                    # in the message) rather than continuing the loop.
                    return
                title = self.utils.tag_to_string(title)
                # Collapse all runs of whitespace to single spaces.
                title = ' '.join(title.split())
                message = "Title: " + title
            # Other resources
            else:
                content_length = head.headers.get('content-length', '')
                if content_length.isdigit():
                    size = self.sizeof_fmt(int(content_length))
                else:
                    size = "Unknown size"
                # Searches for the last segment of the URL (the filename)
                filename = re.search(r'/([^/]+)/?$', url).groups(1)[0]
                message = "{}: {} ({})".format(filename, content_type, size)
            self.bot.privmsg(msg.channel, message)
    except Exception as exception:
        print("Link Info Exception!")
        print(type(exception), exception)
开发者ID:ackwell,项目名称:ninjabot,代码行数:62,代码来源:linkinfo.py
注:本文中的requests.utils.get_encodings_from_content函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论