本文整理汇总了Python中utils.get_domain函数的典型用法代码示例。如果您正苦于以下问题:Python get_domain函数的具体用法?Python get_domain怎么用?Python get_domain使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了get_domain函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: get_source_node
def get_source_node(self):
    """Find the DOM node most likely to name the article's source."""
    # An explicit CSS selector supplied by the caller takes precedence.
    selector = self.options.get('source_selector')
    if selector:
        nodes = self.doc.cssselect(selector)
        if len(nodes) == 1:
            return nodes[0]
        for candidate in nodes:
            found = self.has_source(candidate)
            if found is not None:
                return found
    # Otherwise scan every element under <body> for a source marker.
    body = self.doc.find('body')
    if body is None:
        return None
    for candidate in body.iter():
        found = self.has_source(candidate)
        if found is not None:
            return found
    # Last resort: an off-site link whose anchor text ends with u'\u62a5'
    # ("...Daily/News") but not u'\u4e3e\u62a5' ("report abuse") usually
    # names the source publication.
    own_domain = get_domain(self.url)
    for anchor in self.doc.iter('a'):
        href = anchor.get('href')
        if not (href and href.startswith('http') and get_domain(href) != own_domain):
            continue
        text = self.get_block_text(anchor)
        if len(text) > 2 and text.endswith(u'报') and not text.endswith(u'举报'):
            return anchor
开发者ID:dotajin,项目名称:haoku-open,代码行数:30,代码来源:article.py
示例2: data_needs
def data_needs(request, template="needs.html"):
    """Render the data-needs page with themes grouped into ordered lists."""
    all_themes = Theme.objects.all().order_by("display_name")
    ordered, by_theme = add_ordered_needs_lists(all_themes)
    ctx = {
        "themes": all_themes,
        "theme_dict": by_theme,
        "ordered_themes": ordered,
        "domain": get_domain(8000),
        "domain8010": get_domain(),
    }
    return render_to_response(template, RequestContext(request, ctx))
开发者ID:WCGA,项目名称:marine-planner-wcodp,代码行数:11,代码来源:views.py
示例3: csw_listing
def csw_listing(request, template='pycsw_catalog_view.html'):
    """List pycsw catalog records, tagging each with a sequential html id.

    :param request: the incoming Django request
    :param template: template rendered with the records and portal domains
    """
    if logger:
        logger.info("Start csw_listing")
    csw_recs = pycsw_records.objects.using('pycsw_test').all().order_by('organization')
    # enumerate() replaces the original manual html_id counter.
    for html_id, rec in enumerate(csw_recs):
        rec.html_id = html_id
    context = {'records': csw_recs, 'domain': get_domain(8000), 'domain8010': get_domain()}
    if logger:
        logger.info("End csw_listing")
    return render_to_response(template, RequestContext(request, context))
开发者ID:DanRamage,项目名称:secoora-portal,代码行数:12,代码来源:views.py
示例4: add
def add(self, cate):
    """Insert a category into the domain/subdomain/path trie keyed by its url."""
    url = cate['url']
    domain = get_domain(url)
    subs = get_subdomains(url)
    segments = get_path(url).split('/')
    query = urlparse.urlparse(url).query
    # Root level is keyed by registered domain.
    node = self.root.setdefault(domain, {'sub': {}, 'path': {}})
    # Descend through subdomain labels unless the host is exactly "www".
    if subs and subs != ['www']:
        for sub in subs:
            node = node['sub'].setdefault(sub, {'sub': {}, 'path': {}})
    # Then descend through each path segment.
    for segment in segments:
        node = node['path'].setdefault(segment, {'path': {}})
    # A query string gets its own (always freshly overwritten) leaf node.
    if query:
        key = 'query___' + query
        node['path'][key] = {'path': {}}
        node = node['path'][key]
    node['cate'] = cate
开发者ID:dotajin,项目名称:haoku-open,代码行数:28,代码来源:best2spider.py
示例5: bookmark_link
def bookmark_link(self):
    """Return this layer's bookmark url, inheriting from the parent when possible."""
    # A sublayer without its own bookmark borrows the parent's template,
    # substituting its own id.
    if not self.bookmark and self.is_sublayer and self.parent.bookmark:
        return self.parent.bookmark.replace('<layer_id>', str(self.id))
    # Still no bookmark: synthesize a planner link from the slug.
    if not self.bookmark:
        return '%s/planner/#%s' % (get_domain(8000), self.slug)
    return self.bookmark
开发者ID:Ecotrust,项目名称:marco-portal,代码行数:7,代码来源:models.py
示例6: __init__
def __init__(self, link, base_url):
    """Wrap a parsed <a> element, resolving its text, class, href and domain."""
    self.base_url = base_url
    self.parent = link.parent
    self.text = self.get_text(link)
    self.class_ = self.get_class(link)
    # href is resolved against base_url; the domain is derived from it.
    self.href = self.get_href(link, base_url)
    self.domain = get_domain(self.href)
开发者ID:nlpwhu,项目名称:info-source,代码行数:7,代码来源:website.py
示例7: introspect
def introspect(domain):
    """Render the paginated item listing filtered to the requested domain.

    :param domain: domain fragment to match (case-insensitively) against
        each stored record's domain
    """
    # PEP 8: use a def instead of a lambda bound to a name (also matches
    # the standalone filter_func used elsewhere in this module).
    def filter_func(x):
        return get_domain(loads(x[1])).lower() in domain.lower()
    pages, requested_page = get_effective_page(request.args.get("page", 0),
                                               filter_func)
    items = get_items(filter_func, g.db_file, requested_page)
    return render_template("index.html", items=items, pages=pages,
                           requested_page=requested_page,
                           current_page=request.args.get('page', 0))
开发者ID:makoConstruct,项目名称:merveilles_io,代码行数:8,代码来源:routes.py
示例8: is_image_link
def is_image_link(url):
    """Return True when the url points at an image file or a known image host."""
    extension = url.split('.')[-1]
    if extension in img_extensions:
        return True
    # Links into known image-sharing services count as image links too.
    host_parts = get_domain(url).split('.')
    return any(sharer in host_parts for sharer in img_sharers)
开发者ID:morganecf,项目名称:topic-modeling,代码行数:8,代码来源:subreddit_stats.py
示例9: add_learn_links
def add_learn_links(themes):
    """Pair each theme with its /portal/learn/ url on the current domain."""
    base = get_domain()
    return [
        {'theme': theme,
         'learn_link': '%s/portal/learn/%s' % (base, linkify(theme.name))}
        for theme in themes
    ]
开发者ID:atrawog,项目名称:marco-portal,代码行数:8,代码来源:views.py
示例10: get_allowed_from
def get_allowed_from(self, child_urls):
    """
    :param child_urls: List of child urls to check robots.txt on
    :return: A list of allowed child urls to crawl
    """
    allowed = []
    # Dropped the redundant '{0}'.format(...) wrapper around get_domain.
    domains = list(set(get_domain(url) for url in child_urls))
    domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls)
                          for domain in domains}
    for domain in domain_to_children:
        try:
            rules = self.robots.fetch(domain)
            for url in domain_to_children[domain]:
                if rules.allowed(url, self._agent):
                    allowed.append(url)
        except Exception:
            # Best effort: if robots.txt can't be fetched/parsed, assume the
            # whole domain is allowed. Narrowed from a bare except, which
            # also swallowed KeyboardInterrupt/SystemExit.
            allowed.extend(domain_to_children[domain])
    return allowed
开发者ID:netarachelhershko,项目名称:crawler,代码行数:17,代码来源:robots_validator.py
示例11: top_things
def top_things(db_file):
urls = {}
people = {}
graph = {}
db = DB()
if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
print "Could not open database. (Top things)"
cur = db.cursor()
cur.jump_back()
while True:
rec = cur.get(False)
if not rec:
break
loaded_rec = loads(rec[1])
split = get_domain(loaded_rec)
if urls.get(split, False) == False:
urls[split] = 1
else:
urls[split] = urls[split] + 1
person = loaded_rec['person']
if people.get(person, False) == False:
people[person] = 1
else:
people[person] = people[person] + 1
if split is not None and split is not "" and \
person is not None and person is not "":
# Build a crazy relational graph out of my nosql data
if graph.get(split, False) == False:
graph[split] = {"is_person": False, "data": [person], "linked_to_count": 1}
elif person not in graph[split]:
graph[split]["data"].append(person)
graph[split]["linked_to_count"] = graph[split]["linked_to_count"] + 1
if graph.get(person, False) == False:
graph[person] = {"is_person": True, "data": [split]}
elif split not in graph[person]:
graph[person]["data"].append(split)
cur.step_back()
cur.disable()
db.close()
def get_one(x):
return x[1]
return (sorted(urls.items(), key=get_one, reverse=True),
sorted(people.items(), key=get_one, reverse=True),
graph)
开发者ID:lykkin,项目名称:merveilles_io,代码行数:55,代码来源:database.py
示例12: __init__
def __init__(self, input, **options):
    """Prepare a cleaned document from raw html plus extraction options."""
    self.input = input
    self.options = options
    # Option unpacking with defaults.
    self.url = options.get('url', '')
    self.debug = options.get('debug', False)
    self.title = options.get('title', '^^')
    self.pages = options.get('pages', None)
    self.texts = options.get('texts', None)
    self.domain = get_domain(self.url)
    # Clean once, then derive text content and its word count.
    self.doc = clean_html(input, return_doc=True)
    self.text = self.doc.text_content()
    self.len = word_count(self.text) if self.text else 0
开发者ID:dotajin,项目名称:haoku-open,代码行数:12,代码来源:clean+(copy).py
示例13: fetch_from
def fetch_from(self, urls):
    """
    :param urls: A list of urls to fetch sitemaps of
    :return: A list of urls that was found within each sitemap of given urls
    """
    domains = list({get_domain(u) for u in urls})
    sitemaps = self._try_fetch_sitemaps(domains)
    found = []
    for site in sitemaps:
        for content in self.requests_getter.get_content_from(sitemaps[site]):
            locations = self.sitemap_url_extractor.extract_from(content)
            # Nested sitemap indexes (.xml entries) are not page urls.
            found.extend(filter(lambda u: not u.endswith('.xml'), locations))
    return found
开发者ID:netarachelhershko,项目名称:crawler,代码行数:15,代码来源:sitemap_fetcher.py
示例14: fetch_stories
def fetch_stories(self, correlation_id=-1):
"""Fetches new stories from the datasource. Uses the last story external id to
fetch only new stories."""
try:
url = "http://%s/twitter_sensor/?user=%s&password=%s" % (get_domain(), self.user.user_name, self.user.user_password)
tweets = urllib.urlopen(url).read()
tweets = json.loads(tweets)
print tweets
for key in tweets:
try :
authors = []
authors.append(tweets[key])
self.add_read_story(key, authors)
self.add_user(tweets[key])
except:
log_event("fetch_stories_failed", "AgentCell", self.id, "Adding fetched story %s failed, for %s" % (key, self.user), correlation_id)
except:
log_event("fetch_stories_failed", "AgentCell", self.id, "Failed to fetch stories for %s" % self.user, correlation_id)
开发者ID:ofri,项目名称:EventHorizon,代码行数:18,代码来源:models.py
示例15: article
def article():
    """Render the extract/article page for the url given in the query string.

    Looks the article up in mongo first; on a cache miss it fetches the
    page, extracts the article plus up to 10 sibling urls that share the
    same url template, and caches the result with a view counter.
    """
    url = request.args.get('url')
    article = mongo.article.find_one({'_id':url})
    if not article:
        try:
            html = get_or_cache(url)
            article = html2article(html, url, selector=True, merge=True)
            # Fall back to the bare domain when no source name was extracted.
            if article and not article['src_name']:
                article['src_name'] = get_domain(url)
            tpl = url2tpl(url)
            urls = html2urls(html, url)
            # For each linked url keep its longest anchor text.
            texts = dict(map(lambda x: (x[0], max(x[1], key=lambda y:len(y))), urls.iteritems()))
            tmp = dict(map(lambda x: (x, url2tpl(x)), texts.iterkeys()))
            urls = {}
            # Keep at most 10 other urls that share this page's url template
            # (i.e. likely sibling articles).
            for u, t in tmp.iteritems():
                if u != url and t == tpl:
                    urls[u] = texts[u]
                    if len(urls) >= 10:
                        break
            if article:
                article['urls'] = urls
                article['_id'] = url
                article['view'] = 1
                article['last'] = time.time()
                # The urls mapping is JSON-serialized before saving; it is
                # decoded again on the cache-hit path below.
                copy = article.copy()
                copy['urls'] = json.dumps(copy['urls'])
                mongo.article.save(copy)
        except:
            # Best-effort extraction: any failure just renders whatever was
            # built so far (possibly None).
            pass
    else:
        article['urls'] = json.loads(article['urls'])
        mongo.article.update({'_id':url}, {'$set':{'view':article['view'] + 1}})
    if article:
        # Trim pubtime to the date part (first 10 characters).
        article['pubtime'] = article['pubtime'][:10]
    return render_template('extract/article.html', article=article, url=url)
开发者ID:dotajin,项目名称:haoku-open,代码行数:43,代码来源:views.py
示例16: crawl
def crawl(self, url, max_page_depth=5, max_external_sites_page_depth=4, request_rate_limit=4):
    """
    Will crawl a given url up to max_page_depth and max_external_sites_page_depth on a max rate of
    request_rate_limit.
    :param url: The to-be crawled url
    :param max_page_depth: Max internal (same-domain) depth
    :param max_external_sites_page_depth: Max external (different-domain) depth
    :param request_rate_limit: Up to n requests at once
    :return: List of Url objects (See schemas/url.py)
    """
    # Record crawl parameters and seed the internal queue.
    self._domain = get_domain(url)
    self._max_page_depth = max_page_depth
    self._max_external_sites_page_depth = max_external_sites_page_depth
    self._url_scanner.set_request_limit(request_rate_limit)
    self._internal_urls_to_scan.append(url)
    # Same-domain pages first, then everything queued as external.
    self._crawl_internal_urls()
    self._crawl_external_urls()
    return self._get_crawled_urls()
开发者ID:netarachelhershko,项目名称:crawler,代码行数:19,代码来源:crawler.py
示例17: extract_essence
def extract_essence(self, correlation_id):
"""
Analyze the story text, to extract the essence from it. For the essence, look for a matching StoryEssence cell.
If found, link the story cell to the StoryEssence cell. Else create a new StoryEssence cell & link the story to it.
"""
try:
print "extract_essence called for story '%s'" % self.core
client = Client()
response = client.get('http://%s/text_analyzer/extract_essence/' % get_domain(), {'text': self.core}).content
print "got extract essence response: ", response
if response != "":
try :
self.add_essence(response)
except:
print sys.exc_info()
print "essence=", response
log_event("extract_essence_failed", STORY_CELL, self.id, "Adding essence '%s' extracted from story '%s' failed" % (response, self.core), correlation_id)
# all went all, update the flag
self.is_essence_extracted = True
self.save()
except:
print "Failed to extract essence", sys.exc_info()
log_event("extract_essence_failed", STORY_CELL, self.id, "Failed to extract essence from story '%s'" % self.core, correlation_id)
开发者ID:dibaunaumh,项目名称:EventHorizon,代码行数:23,代码来源:models.py
示例18: fetch_stories
def fetch_stories(self, correlation_id=-1):
"""Fetches new stories from the datasource. Uses the last story external id to
fetch only new stories."""
try:
#url = "http://%s/twitter_sensor/?user=%s&password=%s" % (get_domain(), self.user.user_name, self.user.user_password)
#tweets = urllib.urlopen(url).read()
client = Client()
tweets = client.get('http://%s/twitter_sensor/' % get_domain(), {'user': self.user.user_name, 'password': self.user.user_password}).content
tweets = json.loads(tweets)
print tweets
for key in tweets:
try :
authors = []
for story in StoryCell.objects.all():
if story.core == key:
return
authors.append(tweets[key][0])
self.add_read_story(key, authors)
self.add_user(tweets[key][0])
except:
log_event("fetch_stories_failed", AGENT_CELL, self.id, "Adding fetched story %s failed, for %s" % (key, self.user), correlation_id)
except:
print "Failed to fetch stories", sys.exc_info()
log_event("fetch_stories_failed", AGENT_CELL, self.id, "Failed to fetch stories for %s" % self.user, correlation_id)
开发者ID:dibaunaumh,项目名称:EventHorizon,代码行数:24,代码来源:models.py
示例19: get_absolute_url
def get_absolute_url(self):
    """Return the absolute url of this story's detail view."""
    current_domain = get_domain()
    return "http://%s/cells/view/story/%d" % (current_domain, self.id)
开发者ID:dibaunaumh,项目名称:EventHorizon,代码行数:2,代码来源:models.py
示例20: filter_func
def filter_func(x):
    """True when the record's domain matches the enclosing `domain` (case-insensitive)."""
    record = loads(x[1])
    return get_domain(record).lower() in domain.lower()
开发者ID:lykkin,项目名称:merveilles_io,代码行数:2,代码来源:routes.py
注:本文中的utils.get_domain函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论