This article collects typical usage examples of the Python function six.moves.urllib.parse.urldefrag. If you are unsure how to use urldefrag in Python, what it returns, or what it looks like in real code, the curated examples below may help.
The following 17 code examples of urldefrag are shown, sorted by popularity by default.
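Before the examples, here is a minimal sketch (with made-up URLs) of what urldefrag itself does: it splits a URL into everything before the fragment and the fragment itself, returning a 2-tuple.

from six.moves.urllib.parse import urldefrag

# urldefrag splits a URL into (url_without_fragment, fragment); the '#' is dropped.
url, fragment = urldefrag("http://example.com/page?q=1#section-2")
print(url)       # http://example.com/page?q=1
print(fragment)  # section-2

# A URL without a fragment comes back unchanged, with an empty fragment part.
url, fragment = urldefrag("http://example.com/page")
print(url)       # http://example.com/page
print(fragment)  # (empty string)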
Example 1: startElementNS
def startElementNS(self, name, qname, attrs):
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
Author: drewp, Project: rdflib, Lines: 30, Source: rdfxml.py
Example 2: get_disk_name
def get_disk_name(ovf):
    """Get the disk format and file name from an OVF descriptor."""
    root = etree.fromstring(ovf)
    ovf_ns = root.nsmap['ovf']
    id_attr = '{%s}id' % ovf_ns
    href_attr = '{%s}href' % ovf_ns
    files = {f.get(id_attr): f.get(href_attr) for f in
             root.findall('ovf:References/ovf:File', root.nsmap)}
    # we do not care about more than one disk
    disk = root.find('ovf:DiskSection/ovf:Disk', root.nsmap)
    if disk is not None:
        format_attr = '{%s}format' % ovf_ns
        fileref_attr = '{%s}fileRef' % ovf_ns
        ovf_format = disk.get(format_attr)
        if not ovf_format:
            raise Exception("Expecting some format!")
        (format_url, _) = parse.urldefrag(ovf_format)
        try:
            disk_format = SPECS[format_url]
        except KeyError:
            raise Exception("Unknown format!")
        try:
            disk_file = files[disk.get(fileref_attr)]
        except KeyError:
            raise Exception("Unknown disk!")
        return (disk_format, disk_file)
    return None, None
Author: alvarolopez, Project: atrope, Lines: 29, Source: ovf.py
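The urldefrag call in the helper above strips the variant fragment from an OVF format URI so that the bare specification URL can be used as a lookup key. A minimal illustration, assuming the common VMDK format URI (the SPECS mapping in the real code is project-specific):

from six.moves.urllib import parse

# An OVF disk format is identified by a spec URL; the fragment names a variant.
ovf_format = ("http://www.vmware.com/interfaces/specifications/vmdk.html"
              "#streamOptimized")
format_url, variant = parse.urldefrag(ovf_format)
print(format_url)  # http://www.vmware.com/interfaces/specifications/vmdk.html
print(variant)     # streamOptimized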
Example 3: __init__
def __init__(self, request, timeout=180):
    self._url = urldefrag(request.url)[0]
    # converting to bytes to comply with the Twisted interface
    self.url = to_bytes(self._url, encoding='ascii')
    self.method = to_bytes(request.method, encoding='ascii')
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)
    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own logic to handle redirects, there is no
    # need to add the callback _waitForDisconnect.
    # Specifically this avoids the AttributeError exception when
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()
    self._set_connection_attributes(request)
    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)
    # set Content-Length based on the length of the body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == b'POST':
        self.headers['Content-Length'] = 0
Author: 390218462, Project: scrapy, Lines: 33, Source: webclient.py
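The first thing this factory does is drop the fragment from the request URL: fragments are resolved client-side only and are never sent in the HTTP request line. A small sketch of that step in isolation, with a made-up URL:

from six.moves.urllib.parse import urldefrag

# The fragment never reaches the server, so downloader code strips it
# before building the request line.
request_url = "http://example.com/catalog?page=2#reviews"
print(urldefrag(request_url)[0])  # http://example.com/catalog?page=2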
Example 4: escape_ajax
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith("!"):
        return url
    return add_or_replace_parameter(defrag, "_escaped_fragment_", frag[1:])
Author: naisanza, Project: scrapy, Lines: 27, Source: url.py
Example 5: absolutize
def absolutize(self, uri, defrag=1):
    base = urljoin("file:", pathname2url(os.getcwd()))
    result = urljoin("%s/" % base, uri, allow_fragments=not defrag)
    if defrag:
        result = urldefrag(result)[0]
    if not defrag:
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
    return URIRef(result)
Author: hsolbrig, Project: rdflib, Lines: 9, Source: namespace.py
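The defrag branch above joins a relative URI against the current working directory and then discards any fragment from the result. A standalone sketch of that path, using a hypothetical relative URI:

import os
from six.moves.urllib.parse import urldefrag, urljoin
from six.moves.urllib.request import pathname2url

# Build a file: base URL from the current directory, resolve the relative
# URI against it, then drop the fragment (the defrag=1 case above).
base = urljoin("file:", pathname2url(os.getcwd()))
result = urljoin("%s/" % base, "data/schema.ttl#Person")
print(urldefrag(result)[0])  # file:///.../data/schema.ttl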
Example 6: url_query_cleaner
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'
    """
    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url
Author: scrapy, Project: w3lib, Lines: 55, Source: url.py
Example 7: update_params
def update_params(_url, _debug=False, **params):
    """Update the query parameters in a URL.

    ``_url`` is any URL, with or without a query string.

    ``**params`` are query parameters to add or replace. Each value may be a
    string, a list of strings, or None. Passing a list generates multiple
    values for the same parameter. Passing None deletes the corresponding
    parameter if present.

    Return the new URL.

    *Debug mode:* if ``_debug=True``, return a tuple:
    ``[0]`` is the URL without query string or fragment,
    ``[1]`` is the final query parameters as a dict, and
    ``[2]`` is the fragment part of the original URL or the empty string.

    Usage:

    >>> update_params("foo", new1="NEW1")
    'foo?new1=NEW1'
    >>> update_params("foo?p=1", p="2")
    'foo?p=2'
    >>> update_params("foo?p=1", p=None)
    'foo'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1")
    'http://example.com/foo?new1=NEW1#myfrag'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1", _debug=True)
    ('http://example.com/foo', {'new1': 'NEW1'}, 'myfrag')
    >>> update_params("http://www.mau.de?foo=2", brrr=3)
    'http://www.mau.de?foo=2&brrr=3'
    >>> update_params("http://www.mau.de?foo=A&foo=B", foo=["C", "D"])
    'http://www.mau.de?foo=C&foo=D'
    """
    url, fragment = urldefrag(_url)
    if "?" in url:
        url, qs = url.split("?", 1)
        query = parse_qs(qs)
    else:
        query = {}
    for key in params:
        value = params[key]
        if value is not None:
            query[key] = value
        elif key in query:
            del query[key]
    if _debug:
        return url, query, fragment
    qs = urlencode(query, True)
    if qs:
        qs = "?" + qs
    if fragment:
        fragment = "#" + fragment
    return "{0}{1}{2}".format(url, qs, fragment)
Author: jManji, Project: Trivia, Lines: 55, Source: tools.py
Example 8: resolve_ref
def resolve_ref(self, obj, base_url):
    ref = obj.pop('import', None)
    url = urlparse.urljoin(base_url, ref)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError('Circular reference for url %s' % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    document = self.fetch(doc_url)
    fragment = copy.deepcopy(resolve_pointer(document, pointer))
    try:
        result = self.resolve_all(fragment, doc_url)
    finally:
        del self.resolving[url]
    return result
Author: lowks, Project: rabix, Lines: 17, Source: ref_resolver.py
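In this resolver the fragment carries a JSON pointer: urljoin resolves the reference against the document's base URL, and urldefrag then separates the document to fetch from the pointer to walk inside it. A rough illustration with made-up paths:

from six.moves.urllib import parse as urlparse

# A reference like "tools.json#/definitions/tool" splits into the document
# to fetch and the JSON pointer to resolve within it.
url = urlparse.urljoin("file:///project/workflow.json",
                       "tools.json#/definitions/tool")
doc_url, pointer = urlparse.urldefrag(url)
print(doc_url)  # file:///project/tools.json
print(pointer)  # /definitions/tool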
Example 9: url_query_cleaner
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>
    """
    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url = urldefrag(url)[0]
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    return '?'.join([base, sep.join(querylist)]) if querylist else base
Author: Preetwinder, Project: w3lib, Lines: 44, Source: url.py
Example 10: resolve_ref
def resolve_ref(self, obj, base_url):
    ref, mixin, checksum = (obj.pop("$ref", None), obj.pop("$mixin", None), obj.pop("$checksum", None))
    ref = ref or mixin
    url = urlparse.urljoin(base_url, ref)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError("Circular reference for url %s" % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    document = self.fetch(doc_url)
    fragment = copy.deepcopy(resolve_pointer(document, pointer))
    try:
        self.verify_checksum(checksum, fragment)
        if isinstance(fragment, dict) and mixin:
            fragment = dict(obj, **fragment)
        result = self.resolve_all(fragment, doc_url)
    finally:
        del self.resolving[url]
    return result
Author: RitwikGupta, Project: rabix, Lines: 20, Source: ref_resolver.py
Example 11: download_request
def download_request(self, request):
    timeout = request.meta.get('download_timeout') or self._connectTimeout
    agent = self._get_agent(request, timeout)
    # request details
    url = urldefrag(request.url)[0]
    method = request.method
    headers = TxHeaders(request.headers)
    bodyproducer = _RequestBodyProducer(request.body) if request.body else None
    start_time = time()
    d = agent.request(method, url, headers, bodyproducer)
    # set download latency
    d.addCallback(self._cb_latency, request, start_time)
    # response body is ready to be consumed
    d.addCallback(self._cb_bodyready, request)
    d.addCallback(self._cb_bodydone, request, url)
    # check download timeout
    self._timeout_cl = reactor.callLater(timeout, d.cancel)
    d.addBoth(self._cb_timeout, request, url, timeout)
    return d
Author: BillWangCS, Project: scrapy, Lines: 21, Source: http11.py
Example 12: download_request
def download_request(self, request):
    timeout = request.meta.get('download_timeout') or self._connectTimeout
    agent = self._get_agent(request, timeout)
    # request details
    url = urldefrag(request.url)[0]
    method = to_bytes(request.method)
    headers = TxHeaders(request.headers)
    if isinstance(agent, self._TunnelingAgent):
        headers.removeHeader(b'Proxy-Authorization')
    if request.body:
        bodyproducer = _RequestBodyProducer(request.body)
    elif method == b'POST':
        # Setting Content-Length: 0 even for POST requests is not a
        # MUST per HTTP RFCs, but it's common behavior, and some
        # servers require it, otherwise returning HTTP 411 Length Required.
        #
        # RFC 7230#section-3.3.2:
        # "a Content-Length header field is normally sent in a POST
        # request even when the value is 0 (indicating an empty payload body)."
        #
        # Twisted < 17 will not add "Content-Length: 0" by itself;
        # Twisted >= 17 fixes this;
        # Using a producer with an empty string sends `0` as Content-Length
        # for all versions of Twisted.
        bodyproducer = _RequestBodyProducer(b'')
    else:
        bodyproducer = None
    start_time = time()
    d = agent.request(
        method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
    # set download latency
    d.addCallback(self._cb_latency, request, start_time)
    # response body is ready to be consumed
    d.addCallback(self._cb_bodyready, request)
    d.addCallback(self._cb_bodydone, request, url)
    # check download timeout
    self._timeout_cl = reactor.callLater(timeout, d.cancel)
    d.addBoth(self._cb_timeout, request, url, timeout)
    return d
Author: JohnDoes95, Project: project_parser, Lines: 40, Source: http11.py
Example 13: download_request
def download_request(self, request):
    timeout = request.meta.get('download_timeout') or self._connectTimeout
    agent = self._get_agent(request, timeout)
    # request details
    url = urldefrag(request.url)[0]
    method = to_bytes(request.method)
    headers = TxHeaders(request.headers)
    if isinstance(agent, self._TunnelingAgent):
        headers.removeHeader(b'Proxy-Authorization')
    bodyproducer = _RequestBodyProducer(request.body) if request.body else None
    start_time = time()
    d = agent.request(
        method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
    # set download latency
    d.addCallback(self._cb_latency, request, start_time)
    # response body is ready to be consumed
    d.addCallback(self._cb_bodyready, request)
    d.addCallback(self._cb_bodydone, request, url)
    # check download timeout
    self._timeout_cl = reactor.callLater(timeout, d.cancel)
    d.addBoth(self._cb_timeout, request, url, timeout)
    return d
Author: bedreamer, Project: scrapy, Lines: 24, Source: http11.py
Example 14: resolve_ref
def resolve_ref(self, obj, base_url):
    ref = obj.pop('import', None)
    txt = obj.pop('include', None)
    parse = txt is None
    url = urlparse.urljoin(base_url, ref or txt)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError('Circular reference for url %s' % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    try:
        document = self.fetch(doc_url, parse)
        if parse:
            fragment = (copy.deepcopy(self.index.get("#" + pointer))
                        or resolve_pointer(document, pointer))
            result = self.resolve_all(fragment, doc_url)
        else:
            result = document
    finally:
        del self.resolving[url]
    return result
Author: dionjwa, Project: rabix, Lines: 24, Source: ref_resolver.py
Example 15: escape_ajax
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    # >>> urlparse('http://www.example.com/ajax.html?k1=v1&k2=v2#!key=value')
    # ParseResult(scheme='http', netloc='www.example.com', path='/ajax.html',
    #             params='', query='k1=v1&k2=v2', fragment='!key=value')
    defrag, frag = urldefrag(url)
    # urldefrag extracts the fragment on its own; for the URL above it returns
    # ('http://www.example.com/ajax.html?k1=v1&k2=v2', '!key=value').
    # If the fragment does not start with '!', the URL is returned unchanged;
    # otherwise it is rewritten as an _escaped_fragment_ query parameter.
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])  # [1:] drops the leading '!'
Author: Terrenceyang213, Project: SourceLearningNote-Scrapy-, Lines: 36, Source: url.py
Example 16: raw_process_reference
def raw_process_reference(self, path):
    uri = "file://" + os.path.abspath(path)
    fileuri, _ = urldefrag(uri)
    return RawProcessReference(self.raw_document_loader.fetch(fileuri), uri)
Author: AAFC-MBB, Project: galaxy-1, Lines: 4, Source: schema.py
Example 17: _get_versioned_url
def _get_versioned_url(full_url, version):
    parsed_url, _ = parse.urldefrag(full_url)
    if version[-1] != '/':
        version += '/'
    return parse.urljoin(parsed_url, version)
Author: great2soul, Project: barbican, Lines: 6, Source: versions.py
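Here urldefrag is used defensively: any fragment is dropped from the discovery URL before the version segment is joined on. A quick sketch with a made-up endpoint:

from six.moves.urllib import parse

# Drop a stray fragment, make sure the version ends with '/', then join.
full_url = "https://keymanager.example.org:9311/#discovery"
base, _ = parse.urldefrag(full_url)
print(parse.urljoin(base, "v1/"))  # https://keymanager.example.org:9311/v1/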
Note: The six.moves.urllib.parse.urldefrag examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by various developers; copyright remains with the original authors, and redistribution and use should follow the corresponding project licenses. Do not republish without permission.