This article collects typical usage examples of the Python spider.Spider class. If you are wondering what the Spider class is for, how to use it, or want to see it in real code, the curated class examples below may help.
The following presents 20 code examples of the Spider class, sorted by popularity by default.
Example 1: main
def main():
    """Program entry point.

    Parse and validate the command-line arguments, configure the logger
    accordingly, create the thread pool and the spider, add worker threads
    to the pool to process tasks, and let the spider feed tasks into the pool.
    """
    # Parse and validate the command-line arguments
    args = base.get_arg()
    if not base.check_args(args):
        print 'Args error!'
        sys.exit()
    base.handle_args(args)
    # Configure the logger
    if not base.set_logger(args.log_file, args.log_level):
        print 'Set logger error'
        sys.exit()
    logger.debug('Get args :%s' % args)
    # Self-test
    if args.test_self:
        base.test_self()
        sys.exit()
    database = Sqlite3DB(args.db_file)
    # Create the spider and the thread pool. Add thread_num worker threads
    # to the pool; the spider then creates tasks and puts them into the pool.
    spider = Spider(args.url, args.depth, args.thread_num, args.key_word,
                    args.down_file, database)
    main_thread = MainThread(spider)
    main_thread.start()
    spider.start()
Author: micheal-xudb | Project: py-spider | Lines: 32 | Source file: main.py
Example 2: work
def work():
    while True:
        item = thread_queue.get()
        url = item['url']
        distance = item['distance']
        Spider.crawl_page(threading.current_thread().name, url, distance)
        thread_queue.task_done()
Author: nenad1001 | Project: CrazyS | Lines: 7 | Source file: main.py
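A worker like this is normally attached to a pool of daemon threads that pull jobs off the shared queue. A minimal sketch of that wiring, reusing the work() function above (thread_queue is defined here only for illustration, and NUMBER_OF_THREADS and create_jobs are assumed names, not taken from the project):

import threading
from queue import Queue  # use "from Queue import Queue" on Python 2

thread_queue = Queue()
NUMBER_OF_THREADS = 8  # illustrative value

def create_workers():
    # Start daemon threads that run work() forever, pulling jobs off the queue
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

def create_jobs(items):
    # Each job carries the URL to crawl and its link distance from the seed page
    for item in items:
        thread_queue.put(item)
    thread_queue.join()  # block until every queued job is marked task_done()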
Example 3: walk
def walk(self, url, outfile):
    self.pageinfo = {}
    self.errors = []
    Spider.walk(self, url, self.iswebpage)
    print("\r[ ] Processed %i urls" % (len(self.pageinfo)))
    urlset = ET.Element('urlset', {'xmlns': "http://www.sitemaps.org/schemas/sitemap/0.9"})
    for page in self.pageinfo:
        url = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url, 'loc')
        lastmod = ET.SubElement(url, 'lastmod')
        changefreq = ET.SubElement(url, 'changefreq')
        priority = ET.SubElement(url, 'priority')
        loc.text = page
        lastmod.text = self.pageinfo[page]['lastmod']
        changefreq.text = self.pageinfo[page]['change']
        priority.text = '%0.1f' % self.pageinfo[page]['pri']
    tree = ET.ElementTree(urlset)
    tree.write(outfile, encoding='utf-8', xml_declaration=True)
    if len(self.errors) > 0:
        print("[!] The following pages produced errors:")
        for e in self.errors:
            print(" %i %s" % (e[1], e[0]))
Author: nada-labs | Project: sitemap-generator | Lines: 28 | Source file: sitemap.py
Example 4: work
def work():
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
    print('main.py/work()/end')
Author: AllenDrake2016 | Project: Readme | Lines: 7 | Source file: main.py
Example 5: downloadArchivesList
def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Set up downloader'''
    queue = initDownloader(numThreads)
    import csv
    f = open(aList, 'rb')
    reader = csv.reader(f)
    for row in reader:
        startURL = row[0]
        mlName = startURL.split('/')[-2]
        spider = Spider(startURL)
        spider.process_page(startURL)
        '''Only the links to archive files are interesting:
        mailing list archive file names end with '.txt.gz' '''
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        if len(urlList):
            print '%s: %d archives' % (mlName, len(urlList))
            store = os.path.join(container, mlName)
            if not (os.path.isdir(store)):
                os.system("mkdir %s" % store)
            '''Download each archive'''
            addToQ(queue, urlList, store)
    '''If here, download finished. Stop threads'''
    stopDownloader(queue, numThreads)
Author: cupescapone | Project: miningTools | Lines: 29 | Source file: mlArchivesDownloader.py
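The helpers initDownloader, addToQ and stopDownloader are not shown on this page. A minimal sketch of the queue-plus-worker-threads pattern they imply, written for Python 2 to match the print statements above (an assumed implementation, not the project's actual code):

import os
import threading
import urllib
from Queue import Queue

def initDownloader(numThreads):
    # Start numThreads daemon workers that download (url, folder) jobs
    queue = Queue()

    def worker():
        while True:
            job = queue.get()
            if job is None:  # poison pill: stop this worker
                break
            url, folder = job
            target = os.path.join(folder, url.split('/')[-1])
            urllib.urlretrieve(url, target)
            queue.task_done()

    for _ in range(numThreads):
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()
    return queue

def addToQ(queue, urlList, folder):
    # Queue one download job per archive URL
    for url in urlList:
        queue.put((url, folder))

def stopDownloader(queue, numThreads):
    queue.join()  # wait until every queued download has finished
    for _ in range(numThreads):
        queue.put(None)  # tell each worker to exit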
Example 6: __init__
def __init__(self, **kwargs):
    kwargs['enable_reborn'] = True
    kwargs['enable_proxy'] = False
    kwargs['max_login_tries'] = 8
    #kwargs['ips_obj'] = self.ips_obj
    self.out = open('out.txt', 'w+')
    self.login_status = False
    Spider.__init__(self, **kwargs)
Author: vv1133 | Project: spider_engine | Lines: 8 | Source file: example_brand.py
Example 7: run
def run(self):
    sp = Spider()
    if not sp.login_succeed:
        self.stop()
    else:
        while True:
            new_stuff = sp.update()
            if len(new_stuff) > 0:
                print str(len(new_stuff)) + " weibos to update"
                glob.newswall.notifyCallbacks(new_stuff)
            time.sleep(crawl_interval)
Author: brotherb | Project: weibo_sync_wall | Lines: 11 | Source file: crawlerthread.py
Example 8: spider
def spider(self):
    # Add a Cache-Control header to the request
    s = Spider(additional_headers={'Cache-Control': 'max-age=0'})
    try:
        s.fetch(self.url)
    except HTTPError as e:
        # Check whether the pages for this movie exist at all
        if e.msg == 'Not Found':
            return
    # Chinese text that has been utf-8 encoded shows up as sequences like '/u2541',
    # and lxml treats "/" as the end of a tag, so decode the content before parsing
    return etree.HTML(s.content.decode('utf-8'))
Author: StevenLOL | Project: Mtime | Lines: 11 | Source file: parse.py
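The method returns an lxml element tree, or None when the movie page does not exist, so a caller would typically query the result with XPath. A small illustration (the parser instance name and the XPath expression are made up, not taken from the project):

page = parser.spider()  # `parser` is an instance of the class above
if page is not None:
    # Pull the text of the first <title> element, if any
    titles = page.xpath('//title/text()')
    if titles:
        print(titles[0].strip())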
Example 9: main
def main():
    args = command_parser()
    target_url = args.target_url[0]
    depth = int(args.depth[0])
    log_level = int(args.log_level)
    log_file = args.log_file
    thread_number = int(args.thread_number)
    key = args.key
    db_file = args.db_file
    test_self = args.test_self
    spider = Spider(target_url, depth=depth, thread_number=thread_number)
    spider.start()
Author: franciumzh | Project: spider | Lines: 12 | Source file: main.py
Example 10: __init__
def __init__(self, master):
    self.master = master
    east_group = LabelFrame(master, text='东部')
    east_group.grid(row=0, column=0, padx=5, pady=5)
    west_group = LabelFrame(master, text='西部')
    west_group.grid(row=1, column=0, padx=5, pady=5)
    # Eastern Conference standings
    east_ranking = LabelFrame(master, text='东部排名')
    east_ranking.grid(row=0, column=1, rowspan=2, padx=5, pady=5, sticky=N)
    self.east_ranking_list = self.creat_teams_ranking_list(east_ranking)
    # Western Conference standings
    west_ranking = LabelFrame(master, text='西部排名')
    west_ranking.grid(row=0, column=2, rowspan=2, padx=5, pady=5, sticky=N)
    self.west_ranking_list = self.creat_teams_ranking_list(west_ranking)
    # Eastern Conference divisions
    atlantic_group = LabelFrame(east_group, text='大西洋区')
    atlantic_group.grid(row=0, column=0, padx=5, pady=5)
    central_group = LabelFrame(east_group, text='中部区')
    central_group.grid(row=0, column=1, padx=5, pady=5)
    southeast_group = LabelFrame(east_group, text='东南区')
    southeast_group.grid(row=0, column=2, padx=5, pady=5)
    # Western Conference divisions
    pacific_group = LabelFrame(west_group, text='太平洋区')
    pacific_group.grid(row=1, column=0, padx=5, pady=5)
    southwest_group = LabelFrame(west_group, text='西南区')
    southwest_group.grid(row=1, column=1, padx=5, pady=5)
    northwest_group = LabelFrame(west_group, text='西北区')
    northwest_group.grid(row=1, column=2, padx=5, pady=5)
    spider = Spider()
    index_data = spider.load_teams_index()
    teams_ranking_data = spider.load_teams_ranking()
    analyzer = Analyzer()
    teams_data = analyzer.analyze_teams_data(index_data)
    self.teams_ranking = analyzer.analyze_teams_ranking(teams_ranking_data)
    self.load_teams_ranking()
    self.teams_logo = utils.load_teams_logos()
    self.load_group(atlantic_group, teams_data[0:5])
    self.load_group(pacific_group, teams_data[5:10])
    self.load_group(central_group, teams_data[10:15])
    self.load_group(southwest_group, teams_data[15:20])
    self.load_group(southeast_group, teams_data[20:25])
    self.load_group(northwest_group, teams_data[25:30])
Author: Yuanlimakefun | Project: NBATeams | Lines: 52 | Source file: app_frame.py
Example 11: work
def work():
    while True:
        url = queue.get()
        table_name = 'url_title_rel'
        title = Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        #print title
        queue.task_done()
Author: Changjinxing | Project: titleCrawler | Lines: 7 | Source file: main.py
Example 12: grab_crawler
def grab_crawler(data):
    bot = Spider()
    bot.initial_urls = [data['site_url']]
    bot.total = data['image_count']
    bot.result_status = 'inprogress'
    bot.image_type = data['image_type']
    bot.run()
Author: NewOldMax | Project: image-grabber | Lines: 7 | Source file: app.py
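grab_crawler expects a mapping with the three keys read above; a call might look like this (the concrete values are only illustrative):

grab_crawler({
    'site_url': 'http://example.com',  # seed page to crawl
    'image_count': 50,                 # how many images to grab
    'image_type': 'jpg',               # file type to keep
})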
Example 13: create_spider
def create_spider(self):
    spider = Spider()
    xml = parse(self._filename)
    params = xml.getElementsByTagName(self._parameters)
    if params is not None:
        params = params[0]
        pages = params.getElementsByTagName(self._page)
        for page in pages:
            print(page.firstChild.data)
            spider.add_url(page.firstChild.data)
        domains = params.getElementsByTagName(self._domain)
        for domain in domains:
            print(domain.firstChild.data)
            spider.add_domain(domain.firstChild.data)
        depth = params.getElementsByTagName(self._depth)
        if depth is not None:
            depth = depth[0]
            print(depth.firstChild.data)
            spider.set_max_depth(depth.firstChild.data)
    return spider
Author: 2gisprojectT | Project: terehov-soundcloud | Lines: 25 | Source file: spider_xml_factory.py
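The factory reads start pages, allowed domains, and a maximum depth from an XML file whose tag names are stored in the _parameters/_page/_domain/_depth attributes. A sketch of what using it could look like; the tag names, the SpiderXmlFactory class name, and its constructor signature are assumptions, not confirmed by this page:

# Illustrative config; the real tag names depend on the factory's attributes.
example_config = """<?xml version="1.0" encoding="utf-8"?>
<parameters>
    <page>https://soundcloud.com/discover</page>
    <domain>soundcloud.com</domain>
    <depth>2</depth>
</parameters>
"""

with open('spider_config.xml', 'w') as f:
    f.write(example_config)

factory = SpiderXmlFactory('spider_config.xml')  # assumed constructor
spider = factory.create_spider()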
Example 14: work
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Author: 99sbr | Project: Scrapy-Spider | Lines: 5 | Source file: main.py
Example 15: TestSpider
class TestSpider(unittest.TestCase):
    def setUp(self):
        self.test_spider = Spider("aladinfoods.bg")

    def test_spider_init(self):
        self.assertEqual(self.test_spider.scaned_url, [])
        self.assertEqual(self.test_spider.domain, "aladinfoods.bg")

    def test_is_outgoing(self):
        self.assertFalse(self.test_spider.is_outgoing("http://aladinfoods.bg"))

    def test_is_not_outgoing(self):
        self.assertTrue(self.test_spider.is_outgoing("http://hackbulgaria.com"))

    def test_is_valid(self):
        self.assertTrue(self.test_spider.is_valid("http://aladinfoods.bg/menu"))

    def test_is_not_valid(self):
        self.assertFalse(self.test_spider.is_valid("http://hackbulgaria.com"))
Author: AlexanderTankov | Project: SearchEngine | Lines: 20 | Source file: spider_test.py
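This is a standard unittest test case; assuming the module is saved as spider_test.py next to spider.py, the tests can be run from the command line with:

python -m unittest spider_test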
Example 16: downloadArchives
def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5):
    '''Crawl <startURL> and find all mailing list archives (given the filename <extension>).
    Store the files in the folder with the path <container>.
    If <lookInsideSubfolders>, then go one level deeper (crawl all first-order links as well).
    '''
    '''Set up downloader'''
    queue = initDownloader(numThreads)
    print 'Downloading archives from', startURL
    if not lookInsideSubfolders:
        spider = Spider(startURL)
        spider.process_page(startURL)
        '''Only the links to archive files are interesting:
        mailing list archive file names end with '.txt.gz' '''
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        print '%d archives' % (len(urlList))
        addToQ(queue, urlList, container)
    else:
        spider = Spider(startURL)
        spider.process_page(startURL)
        for link in sorted(spider.URLs):
            subspider = Spider(link)
            subspider.process_page(link)
            mlName = link.split('/')[-2]
            '''Only the links to archive files are interesting:
            mailing list archive file names end with '.txt.gz' '''
            urlList = [x for x in sorted(subspider.URLs) if x.endswith(extension)]
            if len(urlList):
                print '%s: %d archives' % (mlName, len(urlList))
                '''Create a folder for the mailing list'''
                store = os.path.join(container, mlName)
                if not (os.path.isdir(store)):
                    os.system("mkdir %s" % store)
                addToQ(queue, urlList, store)
    '''If here, download finished. Stop threads'''
    stopDownloader(queue, numThreads)
Author: cupescapone | Project: miningTools | Lines: 46 | Source file: mlArchivesDownloader.py
Example 17: __init__
def __init__(self, url=None, index_path=None):
    '''Initializes a WebFetcher instance and its base attributes'''
    # the url which should be fetched
    self._url = url
    self._base_url = self._url
    file_ext = os.path.splitext(url)[1]
    if file_ext:
        # trim the file name
        self._base_url = self._url[:self._url.rindex("/")]
    # the path where everything will be saved
    self._index_path = index_path
    self._spider = Spider()
    self._CSS_RE = re.compile(r"url\(([^\)]*)\)")
Author: heitara | Project: webfetcher | Lines: 13 | Source file: webfetcher.py
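The _CSS_RE pattern pulls resource references out of CSS url(...) expressions, which is how a fetcher like this can find images and fonts referenced from stylesheets. A quick standalone illustration of what it matches (the sample stylesheet text is made up):

import re

_CSS_RE = re.compile(r"url\(([^\)]*)\)")
css = "body { background: url(img/bg.png); } .logo { background: url('logo.svg'); }"
print(_CSS_RE.findall(css))  # ['img/bg.png', "'logo.svg'"]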
Example 18: get
def get(self, *args, **kwargs):
    query = self.get_argument("query")
    student_number = self.get_argument("stu_no")
    password = self.get_argument("pwd")
    spider = Spider(student_number=student_number, password=password)
    spider.authorized()
    student_id = spider.student_id
    self.set_header("Content-Type", "application/json")
    try:
        if query == "semester":
            storage = self.redis_cls.get_semester_grade(student_id)
            if storage is None:
                current = spider.parse_semester_grade()
                if current.code != 0:
                    self.write(json.dumps(dict(current._asdict())))
                    return
                data = current.data
            else:
                data = storage
        elif query == "pass":
            storage = self.redis_cls.get_passed_grade(student_id)
            if storage is None:
                current = spider.parse_passed_grade()
                if current.code != 0:
                    self.write(json.dumps(dict(current._asdict())))
                    return
                data = current.data
            else:
                data = storage
        elif query == "fail":
            storage = self.redis_cls.get_failed_grade(student_id)
            if storage is None:
                current = spider.parse_failed_grade()
                if current.code != 0:
                    self.write(json.dumps(dict(current._asdict())))
                    return
                data = current.data
            else:
                data = storage
        else:
            raise ValueError("Query Operation Out")
        self.write(gen_response(code=0x0000,
                                data=data,
                                msg="成功",
                                en_msg="Success"))
    except Exception as err:
        self.write(gen_response(code=0x0001,
                                msg=str(err),
                                en_msg="Unknown Error"))
Author: qiu0130 | Project: usthAPI | Lines: 54 | Source file: handler.py
Example 19: __init__
def __init__(self, master, team_id):
    self.team_info_win = Toplevel(master)
    self.team_info_win.resizable(False, False)
    self.team_id = team_id
    self.spider = Spider()
    team_info_data = self.spider.load_team_info(team_id)
    analyzer = Analyzer()
    self.team_info = analyzer.analyze_team_info(team_info_data)
    (self.team_average, self.team_leader) = analyzer.analyze_team_data_leader(team_info_data)
    self.team_info_win.title(self.team_info.name)
    self.load_team_introduction()
    self.load_team_data()
    self.load_players()
Author: Yuanlimakefun | Project: NBATeams | Lines: 17 | Source file: team_info_frame.py
Example 20: setUp
def setUp(self):
    """Set up"""
    self.spider_q = Queue()
    self.db_q = Queue()
    self.url_q = Queue()
    self.pages = ['http://exchanges.state.gov/heritage/index.html',
                  'http://exchanges.state.gov/heritage/iraq.html',
                  'http://exchanges.state.gov/heritage/special.html',
                  'http://exchanges.state.gov/heritage/culprop.html',
                  'http://exchanges.state.gov/heritage/afcp.html']
    self.start = self.pages[0]
    for i in range(5):
        self.spider = Spider(self.spider_q, self.db_q, self.url_q,
                             self.start, blacklist=os.path.abspath('blacklist.txt'))
        self.spider.setDaemon(True)
        self.spider.start()
    self.soups = [BeautifulSoup(requests.get(page).text) for page in self.pages]
    for soup in self.soups:
        self.spider_q.get(soup)
    self.spider_q.join()
    self.soup = self.soups[0]
Author: NathanKleekamp | Project: pdf-scraper | Lines: 23 | Source file: test_spider.py
Note: The spider.Spider class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers, and copyright remains with the original authors. Refer to each project's License before using or redistributing the code; do not reproduce this article without permission.