本文整理汇总了Python中searx.engines.xpath.extract_text函数的典型用法代码示例。如果您正苦于以下问题:Python extract_text函数的具体用法?Python extract_text怎么用?Python extract_text使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了extract_text函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: response
def response(resp):
    """Parse a Google Videos results page into searx video results.

    Returns a list of dicts (url/title/content/thumbnail) rendered with
    the 'videos.html' template.
    """
    results = []
    dom = html.fromstring(resp.text)

    # The base64 thumbnail data lives in a single inline <script> that calls
    # _setImagesSrc; it is identical for every result, so extract it once
    # instead of once per loop iteration.
    script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)

    # parse results
    for result in dom.xpath('//div[@class="g"]'):
        title = extract_text(result.xpath('.//h3'))
        url = result.xpath('.//div[@class="r"]/a/@href')[0]
        content = extract_text(result.xpath('.//span[@class="st"]'))

        # get thumbnails: find the blob bound to this result's image id.
        # Raw strings avoid the invalid '\;' escape of the original literal;
        # 'img_id' avoids shadowing the builtin id().
        img_id = result.xpath('.//div[@class="s"]//img/@id')[0]
        thumbnails_data = re.findall(
            r"s='(.*?)(?:\\[a-z,1-9,\\]+'|')\;var ii=\[(?:|['vidthumb\d+',]+)'" + img_id,
            script)

        thumbnail = ''
        if thumbnails_data:
            tmp = re.findall(r'(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)',
                             thumbnails_data[0])
            if tmp:
                # keep the last (largest) match, as before
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
开发者ID:asciimoo,项目名称:searx,代码行数:32,代码来源:google_videos.py
示例2: response
def response(resp):
    """Extract web results from a Startpage HTML response."""
    dom = html.fromstring(resp.content)
    results = []

    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue

        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
            continue

        title = escape(extract_text(link))

        desc = result.xpath('./p[@class="desc"]')
        content = escape(extract_text(desc)) if desc else ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    return results
开发者ID:Acidburn0zzz,项目名称:searx,代码行数:31,代码来源:startpage.py
示例3: response
def response(resp):
    """Build searx results from a Twitter search page.

    The published date is optional: it is only attached when the tweet
    carries a timestamp element.
    """
    results = []
    dom = html.fromstring(resp.text)

    for tweet in dom.xpath(results_xpath):
        try:
            link = tweet.xpath(link_xpath)[0]
            content = extract_text(tweet.xpath(content_xpath)[0])
        except Exception:
            # skip tweets whose link/content cannot be extracted
            continue

        entry = {'url': urljoin(base_url, link.attrib.get('href')),
                 'title': extract_text(tweet.xpath(title_xpath)),
                 'content': content}

        pubdate = tweet.xpath(timestamp_xpath)
        if len(pubdate) > 0:
            # the timestamp element carries epoch seconds in 'data-time'
            timestamp = float(pubdate[0].attrib.get('data-time'))
            entry['publishedDate'] = datetime.fromtimestamp(timestamp, None)

        results.append(entry)

    return results
开发者ID:davidar,项目名称:searx,代码行数:33,代码来源:twitter.py
示例4: response
def response(resp):
    """Parse 500px photo search results into searx image results."""
    results = []
    dom = html.fromstring(resp.text)
    # matches the thumbnail size suffix; raw string fixes the invalid
    # '\.' escape of the original non-raw literal
    regex = re.compile(r'3\.jpg.*$')

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        link = result.xpath('.//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(result.xpath('.//div[@class="title"]'))
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        # To have a bigger thumbnail, uncomment the next line
        # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
        content = extract_text(result.xpath('.//div[@class="info"]'))
        # full-size image url is derived from the thumbnail filename
        img_src = regex.sub('2048.jpg', thumbnail_src)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
开发者ID:3615pipou,项目名称:searx,代码行数:27,代码来源:www500px.py
示例5: response
def response(resp):
    """Parse Bing Videos results; stops once number_of_results is reached."""
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):
        # try to extract the url
        url_container = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
        if len(url_container) > 0:
            url = loads(url_container[0])['purl']
        else:
            url = result.xpath('./a/@href')[0]

        # discard results that do not return an external url
        # very recent results sometimes don't return the video's url
        if url.startswith('/videos/search?'):
            continue

        results.append({
            'url': url,
            'title': extract_text(result.xpath('./a//div[@class="tl"]')),
            'content': extract_text(result.xpath('.//div[@class="pubInfo"]')),
            'thumbnail': result.xpath('.//div[@class="vthumb"]/img/@src')[0],
            'template': 'videos.html',
        })

        # first page ignores requested number of results
        if len(results) >= number_of_results:
            break

    return results
开发者ID:MrLpk,项目名称:searx,代码行数:34,代码来源:bing_videos.py
示例6: response
def response(resp):
    """Parse INA search results: HTML embedded in a JSON envelope.

    Returns video results with a publication date normalized to ISO order.
    """
    results = []

    # we get html in a JSON container...
    # ('payload' instead of 'response', which shadowed this function's name)
    payload = loads(resp.text)
    if "content" not in payload:
        return []
    dom = html.fromstring(payload["content"])
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))

        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        if thumbnail[0] == '/':
            # relative thumbnail path -- make it absolute
            thumbnail = base_url + thumbnail

        # force ISO date (dd/mm/yyyy -> yyyy-mm-dd) to avoid wrong parsing
        d = extract_text(result.xpath(publishedDate_xpath)[0]).split('/')
        publishedDate = parser.parse("%s-%s-%s" % (d[2], d[1], d[0]))

        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
开发者ID:JASON0916,项目名称:searx,代码行数:35,代码来源:ina.py
示例7: response
def response(resp):
    """Parse Yahoo search results, plus query suggestions if configured."""
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except Exception:
            # the original bare 'except:' also swallowed KeyboardInterrupt
            # and SystemExit; only skip results that fail to parse
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # if no suggestion found, return results
    if not suggestion_xpath:
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
开发者ID:Reventl0v,项目名称:searx,代码行数:31,代码来源:yahoo.py
示例8: response
def _parse_bing_results(dom, result_xpath, link_xpath):
    """Parse one Bing markup variant: yields url/title/content dicts."""
    results = []
    for result in dom.xpath(result_xpath):
        link = result.xpath(link_xpath)[0]
        results.append({'url': link.attrib.get('href'),
                        'title': extract_text(link),
                        'content': escape(extract_text(result.xpath('.//p')))})
    return results


def response(resp):
    """Parse a Bing results page.

    Tries the legacy markup first and falls back to the newer layout if
    nothing is found; the two loops were previously duplicated inline.
    """
    dom = html.fromstring(resp.content)

    # legacy layout
    results = _parse_bing_results(dom, '//div[@class="sa_cc"]', './/h3/a')

    # return results if something is found
    if results:
        return results

    # parse results again if nothing is found yet (newer layout)
    return _parse_bing_results(dom, '//li[@class="b_algo"]', './/h2/a')
开发者ID:Acidburn0zzz,项目名称:searx,代码行数:35,代码来源:bing.py
示例9: response
def response(resp):
    """Turn a YouTube search page into searx video results (no API)."""
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath('@data-context-item-id')[0]

        results.append({
            'url': base_youtube_url + videoid,
            'title': extract_text(result.xpath(title_xpath)[0]),
            'content': extract_text(result.xpath(content_xpath)[0]),
            'template': 'videos.html',
            'embedded': embedded_url.format(videoid=videoid),
            # thumbnail url is derived directly from the video id
            'thumbnail': 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg',
        })

    return results
开发者ID:Acidburn0zzz,项目名称:searx,代码行数:27,代码来源:youtube_noapi.py
示例10: response
def response(resp):
    """Parse a Bing results page, including the total hit count."""
    results = []
    dom = html.fromstring(resp.text)

    # total result count, e.g. "1,234,000 results"; best effort only
    try:
        count = int(dom.xpath('//span[@class="sb_count"]/text()')[0]
                    .split()[0].replace(",", ""))
        results.append({"number_of_results": count})
    except (IndexError, ValueError):
        # count element missing or not numeric; the original bare
        # 'except:' also hid unrelated errors
        pass

    # parse results (legacy layout)
    for result in dom.xpath('//div[@class="sa_cc"]'):
        link = result.xpath(".//h3/a")[0]
        results.append({"url": link.attrib.get("href"),
                        "title": extract_text(link),
                        "content": extract_text(result.xpath(".//p"))})

    # parse results again if nothing is found yet (newer layout)
    for result in dom.xpath('//li[@class="b_algo"]'):
        link = result.xpath(".//h2/a")[0]
        results.append({"url": link.attrib.get("href"),
                        "title": extract_text(link),
                        "content": extract_text(result.xpath(".//p"))})

    # return results
    return results
开发者ID:kvch,项目名称:searx,代码行数:34,代码来源:bing.py
示例11: response
def response(resp):
    """Parse a Google results page: web results, inline image groups,
    and query suggestions."""
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)

            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
                # remove the link to google news
                continue

            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
                # images result
                results = results + parse_images(result)
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content})
        except Exception:
            # the original bare 'except:' also swallowed KeyboardInterrupt;
            # only skip results that fail to parse
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
开发者ID:Reventl0v,项目名称:searx,代码行数:35,代码来源:google.py
示例12: response
def response(resp):
    """Parse Vimeo search results into searx video entries."""
    results = []
    dom = html.fromstring(resp.text)
    unescape = HTMLParser().unescape

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]

        results.append({
            'url': base_url + videoid,
            'title': unescape(extract_text(result.xpath(title_xpath))),
            'content': '',
            'template': 'videos.html',
            'publishedDate': parser.parse(extract_text(result.xpath(publishedDate_xpath)[0])),
            'embedded': embedded_url.format(videoid=videoid),
            'thumbnail': extract_text(result.xpath(thumbnail_xpath)[0]),
        })

    return results
开发者ID:3615pipou,项目名称:searx,代码行数:26,代码来源:vimeo.py
示例13: response
def response(resp):
    """Parse Vimeo search results (older markup) into video entries."""
    results = []
    dom = html.fromstring(resp.text)
    unescape = HTMLParser().unescape

    # parse results
    for result in dom.xpath(results_xpath):
        published = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))

        results.append({
            "url": base_url + result.xpath(url_xpath)[0],
            "title": unescape(extract_text(result.xpath(title_xpath))),
            "content": "",
            "template": "videos.html",
            "publishedDate": published,
            "thumbnail": extract_text(result.xpath(content_xpath)[0]),
        })

    return results
开发者ID:traceur,项目名称:FrozenSearch,代码行数:28,代码来源:vimeo.py
示例14: response
def response(resp):
    """Parse Framalibre directory results (title, thumbnail, description)."""
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s
        # content property for this...
        title = escape(extract_text(link))

        thumbnail = None
        thumbnail_tags = result.xpath(thumbnail_xpath)
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            # startswith() is safe on an empty string, where the previous
            # thumbnail[0] == '/' test raised IndexError
            if thumbnail.startswith('/'):
                thumbnail = base_url + thumbnail

        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results
开发者ID:MrLpk,项目名称:searx,代码行数:27,代码来源:framalibre.py
示例15: response
def response(resp):
    """Parse a DuckDuckGo HTML results page."""
    results = []
    doc = fromstring(resp.text)

    # parse results
    for r in doc.xpath(result_xpath):
        try:
            res_url = r.xpath(url_xpath)[-1]
        except IndexError:
            # no url element in this result; the original bare 'except:'
            # also hid unrelated errors
            continue
        if not res_url:
            continue

        # append result
        results.append({'title': extract_text(r.xpath(title_xpath)),
                        'content': extract_text(r.xpath(content_xpath)),
                        'url': res_url})

    # return results
    return results
开发者ID:harry-wood,项目名称:searx,代码行数:25,代码来源:duckduckgo.py
示例16: response
def response(resp):
    """Parse DIGBT torrent search results into torrent entries."""
    dom = html.fromstring(resp.content)
    rows = dom.xpath('.//td[@class="x-item"]')
    if not rows:
        return list()

    results = list()
    for row in rows:
        # the "tail" div holds whitespace-separated metadata, including
        # the file size and its unit at fixed positions
        tail_fields = extract_text(row.xpath('.//div[@class="tail"]')).split()

        results.append({
            'url': urljoin(URL, row.xpath('.//a[@title]/@href')[0]),
            'title': row.xpath('.//a[@title]/text()')[0],
            'content': extract_text(row.xpath('.//div[@class="files"]')),
            'filesize': get_torrent_size(tail_fields[FILESIZE],
                                         tail_fields[FILESIZE_MULTIPLIER]),
            'magnetlink': row.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0],
            'seed': 'N/A',
            'leech': 'N/A',
            'template': 'torrent.html',
        })

    return results
开发者ID:cyrilix,项目名称:searx,代码行数:26,代码来源:digbt.py
示例17: response
def response(resp):
    """Parse BTDigg torrent results, sorted by seeders (descending)."""
    results = []
    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@id="search_res"]/table/tr')
    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//td[@class="torrent_name"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
        content = "<br />".join(content.split("\n"))

        # the attr_val spans hold, in order: "<size> <unit>", file count,
        # seeders; evaluate the xpath once instead of four times per row
        attrs = result.xpath('.//span[@class="attr_val"]/text()')
        size_fields = attrs[0].split()
        filesize = size_fields[0]
        filesize_multiplier = size_fields[1]
        files = attrs[1]
        seed = attrs[2]

        # convert seed to int if possible
        seed = int(seed) if seed.isdigit() else 0
        leech = 0

        # convert filesize to byte if possible
        filesize = get_torrent_size(filesize, filesize_multiplier)

        # convert files to int if possible
        files = int(files) if files.isdigit() else None

        magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href']

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
开发者ID:MrLpk,项目名称:searx,代码行数:56,代码来源:btdigg.py
示例18: parse_images
def parse_images(result, google_hostname):
    """Extract inline image results from a Google result node."""
    images = []
    for image in result.xpath(images_xpath):
        page_url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
        src = extract_text(image.xpath(image_img_src_xpath)[0])
        # append result
        images.append({"url": page_url,
                       "title": "",
                       "content": "",
                       "img_src": src,
                       "template": "images.html"})
    return images
开发者ID:kvch,项目名称:searx,代码行数:10,代码来源:google.py
示例19: response
def response(resp):
    """Parse The Pirate Bay search results, sorted by seeders (descending)."""
    results = []
    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@id="searchResult"]//tr')
    # return empty array if nothing is found
    if not search_res:
        return []

    # skip the first row (table header) and parse the rest
    for result in search_res[1:]:
        link = result.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, link.attrib.get("href"))
        title = extract_text(link)
        content = escape(extract_text(result.xpath(content_xpath)))

        seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
        # convert seed/leech to int if possible
        seed = int(seed) if seed.isdigit() else 0
        leech = int(leech) if leech.isdigit() else 0

        magnetlink = result.xpath(magnet_xpath)[0]

        torrentfile_links = result.xpath(torrent_xpath)
        if torrentfile_links:
            torrentfile_link = torrentfile_links[0].attrib.get("href")
        else:
            torrentfile_link = None

        # append result
        results.append({
            "url": href,
            "title": title,
            "content": content,
            "seed": seed,
            "leech": leech,
            "magnetlink": magnetlink.attrib.get("href"),
            "torrentfile": torrentfile_link,
            "template": "torrent.html",
        })

    # return results sorted by seeder
    return sorted(results, key=itemgetter("seed"), reverse=True)
开发者ID:GreenLunar,项目名称:searx,代码行数:54,代码来源:piratebay.py
示例20: response
def response(resp):
    """Parse Bing News results, converting relative timestamps
    ("N minutes ago", "N hours ago", ...) to datetime objects."""
    results = []
    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
        content = escape(extract_text(contentXPath))

        # parse publishedDate
        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                          '//div[contains(@class,"sn_ST")]'
                                          '//span[contains(@class,"sn_tm")]')
        publishedDate = escape(extract_text(publishedDateXPath))

        # every relative-time branch needs the embedded numbers, so run
        # findall once instead of repeating it in each branch
        timeNumbers = re.findall(r'\d+', publishedDate)

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
        else:
            # absolute date string; fall back to "now" if unparsable
            try:
                publishedDate = parser.parse(publishedDate, dayfirst=False)
            except (TypeError, ValueError):
                publishedDate = datetime.now()

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
开发者ID:Acidburn0zzz,项目名称:searx,代码行数:50,代码来源:bing_news.py
注:本文中的searx.engines.xpath.extract_text函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论