本文整理汇总了Python中pyspider.fetcher.tornado_fetcher.Fetcher类的典型用法代码示例。如果您正苦于以下问题：Python Fetcher类的具体用法？Python Fetcher怎么用？Python Fetcher使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Fetcher类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: run_fetcher
def run_fetcher(g=g):
    """Build a ``Fetcher`` from the shared context ``g`` and run it.

    The fetcher reads from ``g.scheduler2fetcher`` and writes to
    ``g.fetcher2processor``.  Its ``xmlrpc_run`` is handed to
    ``run_in_thread`` before ``run()`` is called in the current thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    instance = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    instance.phantomjs_proxy = g.phantomjs_proxy

    run_in_thread(
        instance.xmlrpc_run,
        port=g.fetcher_xmlrpc_port,
        bind=g.webui_host,
    )
    instance.run()
开发者ID:BCriswell,项目名称:pyspider,代码行数:7,代码来源:run.py
示例2: fetcher
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    """Create a ``Fetcher`` from the context object, register it and run it.

    :param ctx: object whose ``obj`` attribute carries the shared queues,
        the ``phantomjs_proxy`` setting and the ``instances`` list.
    :param xmlrpc: when truthy, ``xmlrpc_run`` is started via ``run_in_thread``.
    :param xmlrpc_host: bind address passed to ``xmlrpc_run``.
    :param xmlrpc_port: port passed to ``xmlrpc_run``.
    """
    shared = ctx.obj
    from pyspider.fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor,
    )
    worker.phantomjs_proxy = shared.phantomjs_proxy
    shared.instances.append(worker)

    if xmlrpc:
        run_in_thread(worker.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    worker.run()
开发者ID:YORYOR,项目名称:pyspider,代码行数:10,代码来源:run.py
示例3: setUpClass
def setUpClass(self):
    """Start the services the fetcher tests depend on.

    In order: httpbin (via ``utils.run_in_subprocess``), the fetcher's
    ``xmlrpc_run`` and ``run`` (via ``utils.run_in_thread``), a ``pyproxy``
    process and a ``phantomjs`` process.  ``self.phantomjs`` is ``None``
    when launching ``phantomjs`` raises ``OSError``.
    """
    import tests.data_test_webpage
    import httpbin

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
    self.httpbin = "http://127.0.0.1:14887"

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = "127.0.0.1:25555"

    # The RPC client and the RPC server thread must agree on the port.
    rpc_port = 24444
    self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % rpc_port)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)

    proxy_command = [
        "pyproxy",
        "--username=binux",
        "--password=123456",
        "--port=14830",
        "--debug",
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = "127.0.0.1:14830"

    phantomjs_script = os.path.join(
        os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"
    )
    try:
        self.phantomjs = subprocess.Popen(["phantomjs", phantomjs_script, "25555"])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
开发者ID:appleboy1977,项目名称:pyspider,代码行数:29,代码来源:test_fetcher.py
示例4: setUpClass
def setUpClass(self):
    """Bring up httpbin, the fetcher, pyproxy and (if startable) phantomjs.

    httpbin is launched with ``passthrough_errors=False`` on port 14887.
    ``self.phantomjs`` is set to ``None`` when starting the ``phantomjs``
    executable raises ``OSError``.
    """
    import tests.data_test_webpage
    import httpbin

    # Local HTTP target used by the tests.
    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'

    # Fetcher between two size-10 queues, plus its RPC endpoint and main loop.
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(self.fetcher.run)

    # Authenticating proxy process.
    pyproxy_argv = ['pyproxy']
    pyproxy_argv += ['--username=binux', '--password=123456']
    pyproxy_argv += ['--port=14830', '--debug']
    self.proxy_thread = subprocess.Popen(pyproxy_argv, close_fds=True)
    self.proxy = '127.0.0.1:14830'

    # phantomjs is optional: a missing executable surfaces as OSError.
    js_path = os.path.join(os.path.dirname(__file__),
                           '../pyspider/fetcher/phantomjs_fetcher.js')
    try:
        self.phantomjs = subprocess.Popen(['phantomjs', js_path, '25555'])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
开发者ID:eromoe,项目名称:pyspider,代码行数:26,代码来源:test_fetcher.py
示例5: setUpClass
def setUpClass(self):
    """Create the fetcher under test and start its RPC endpoint and main loop."""
    rpc_port = 24444

    self.inqueue, self.outqueue = Queue(10), Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)

    # Client side of the XML-RPC endpoint started on the next line.
    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % rpc_port)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)
开发者ID:BCriswell,项目名称:pyspider,代码行数:7,代码来源:test_fetcher.py
示例6: webui
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc,
          max_rate, max_burst, username, password):
    """Configure the web UI ``app`` from the context and start serving it.

    ``app.config`` receives the three databases from ``ctx.obj`` and ``cdn``.
    ``max_rate``, ``max_burst``, ``username`` and ``password`` are stored
    only when truthy.

    :param fetcher_rpc: a string is turned into a client with
        ``connect_rpc``; ``None`` means an in-process ``Fetcher`` is used.
    :param scheduler_rpc: a string is turned into a client with
        ``connect_rpc``; ``None`` falls back to the address taken from the
        ``SCHEDULER_PORT_23333_TCP`` environment variable (when
        ``SCHEDULER_NAME`` is set) or to ``http://localhost:23333/``.
    :returns: ``app`` when ``ctx.obj.get('testing_mode')`` is truthy;
        otherwise the result of falling through after ``app.run(...)``.
    """
    g = ctx.obj
    from pyspider.webui.app import app

    # ``basestring`` exists only on Python 2; fall back to ``str`` so the
    # isinstance checks below do not raise NameError on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password

    # fetcher rpc
    if isinstance(fetcher_rpc, string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        from pyspider.fetcher.tornado_fetcher import Fetcher
        # ``async`` is a reserved word from Python 3.7 on, so spelling it as a
        # keyword argument is a SyntaxError; unpacking a dict passes the very
        # same argument and parses on every Python version.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % (
            os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
开发者ID:Debug-Orz,项目名称:pyspider,代码行数:45,代码来源:run.py
示例7: setUpClass
def setUpClass(self):
    """Create the fetcher, start its RPC endpoint and main loop, then phantomjs."""
    rpc_port = 24444

    self.inqueue, self.outqueue = Queue(10), Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = 'localhost:25555'

    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % rpc_port)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)

    # The script is launched with '25555', the port used in phantomjs_proxy above.
    js_path = os.path.join(os.path.dirname(__file__),
                           '../pyspider/fetcher/phantomjs_fetcher.js')
    self.phantomjs = subprocess.Popen(['phantomjs', js_path, '25555'])
开发者ID:aleemb,项目名称:pyspider,代码行数:12,代码来源:test_fetcher.py
示例8: run_webui
def run_webui(g=g):
    """Configure the web UI ``app`` from the shared context ``g`` and run it.

    A local ``Fetcher`` supplies ``app.config['fetch']``.  Rate limits are
    set when ``g.demo_mode`` is truthy, and credentials are read from the
    ``WEBUI_USERNAME`` / ``WEBUI_PASSWORD`` environment variables when the
    former is present.  ``app.debug`` is copied from ``g.debug`` only when
    ``g.all_in_one`` is missing or falsy.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    # ``async`` is a reserved word from Python 3.7 on, so it cannot be written
    # as a keyword argument; dict unpacking passes the same argument.
    fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc

    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0

    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')

    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)
开发者ID:BCriswell,项目名称:pyspider,代码行数:23,代码来源:run.py
示例9: TestResponse
class TestResponse(unittest.TestCase):
    """Checks responses rebuilt from results fetched from a local httpbin."""

    # Minimal task; ``get`` fills in 'url' on a deep copy for every request.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher and launch httpbin on port 14887."""
        # ``async`` is a reserved word from Python 3.7 on, so it cannot be
        # written as a keyword argument; dict unpacking passes the same
        # argument and parses on every Python version.
        self.fetcher = Fetcher(None, None, **{'async': False})
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Terminate the httpbin process started in ``setUpClass``."""
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch ``url`` and return the rebuilt response, asserting HTTP 200.

        A ``url`` that does not start with ``'http://'`` is appended to the
        httpbin base address.  ``kwargs`` are merged into the task dict.
        """
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        task, result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        self.assertEqual(response.status_code, 200, result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        response = self.get('/gzip')
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertIn('deflated', response.text)
开发者ID:zhaoxiaojun,项目名称:pyspider,代码行数:44,代码来源:test_response.py
示例10: fetcher
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, Fetcher=Fetcher):
    """Create, configure and run a fetcher built from the context object.

    :param ctx: object whose ``obj`` attribute carries the shared queues,
        ``phantomjs_proxy`` and the ``instances`` list.
    :param xmlrpc: when truthy, ``xmlrpc_run`` is started via ``run_in_thread``.
    :param xmlrpc_host: bind address passed to ``xmlrpc_run``.
    :param xmlrpc_port: port passed to ``xmlrpc_run``.
    :param poolsize: forwarded to the ``Fetcher`` constructor.
    :param proxy: forwarded to the ``Fetcher`` constructor.
    :param user_agent: when truthy, stored as the fetcher's ``user_agent``.
    :param timeout: when truthy, stored under ``'timeout'`` in a copy of the
        fetcher's ``default_options``.
    :param Fetcher: the class (or factory) used to build the fetcher.
    :returns: the fetcher when ``ctx.obj.get('testing_mode')`` is truthy.
    """
    shared = ctx.obj
    worker = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor,
        poolsize=poolsize,
        proxy=proxy,
    )
    worker.phantomjs_proxy = shared.phantomjs_proxy

    if user_agent:
        worker.user_agent = user_agent
    if timeout:
        # Work on a copy so the options object found on the fetcher is not
        # modified in place.
        worker.default_options = dict(worker.default_options)
        worker.default_options['timeout'] = timeout

    shared.instances.append(worker)
    if shared.get('testing_mode'):
        return worker

    if xmlrpc:
        run_in_thread(worker.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    worker.run()
开发者ID:CKAKA,项目名称:pyspider,代码行数:18,代码来源:run.py
示例11: TestFetcher
class TestFetcher(unittest.TestCase):
    """Fetcher tests: direct ``sync_fetch``, queue round trip and XML-RPC."""

    # Shared fixture.  Tests that modify a task must work on a deep copy
    # (see ``_new_task``) so that this dict is never mutated.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher and start its RPC endpoint and main loop."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        """Ask the fetcher to quit over RPC and wait for its thread."""
        self.rpc._quit()
        self.thread.join()

    def _new_task(self):
        """Return an independent deep copy of ``sample_task_http``.

        ``dict(self.sample_task_http)`` copies only the top level, so writing
        to ``task['fetch'][...]`` would modify the class-level fixture and
        leak into every later test.
        """
        import copy  # local import keeps the class free of extra module-level needs
        return copy.deepcopy(self.sample_task_http)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        self.assertIn('a=b', content)

    def test_10_http_post(self):
        request = self._new_task()
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)
        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        data = self._new_task()
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        data = self._new_task()
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        data = self._new_task()
        data['url'] = 'data:,hello'
        # The payload comes from this test's own fetcher on localhost:24444;
        # pickle.loads must not be used on data from an untrusted peer.
        result = pickle.loads(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')
开发者ID:BCriswell,项目名称:pyspider,代码行数:89,代码来源:test_fetcher.py
示例12: TestFetcher
class TestFetcher(unittest.TestCase):
    """Fetcher tests with a phantomjs process running alongside the fetcher."""

    # Base task; tests that change it take a deep copy first.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'cookies': {'c': 'd'},
            'timeout': 60,
            'save': 'abc',
        },
        'process': {'callback': 'callback', 'save': [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher, start its RPC endpoint and main loop, then phantomjs."""
        rpc_port = 24444
        self.inqueue, self.outqueue = Queue(10), Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % rpc_port)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
        self.thread = utils.run_in_thread(self.fetcher.run)

        js_path = os.path.join(os.path.dirname(__file__),
                               '../pyspider/fetcher/phantomjs_fetcher.js')
        self.phantomjs = subprocess.Popen(['phantomjs', js_path, '25555'])

    @classmethod
    def tearDownClass(self):
        """Stop phantomjs, then the fetcher, and pause for one second."""
        self.phantomjs.kill()
        self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        fetched = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(fetched['status_code'], 200)
        self.assertEqual(fetched['orig_url'], self.sample_task_http['url'])
        self.assertEqual(fetched['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', fetched)
        body = fetched['content']
        for fragment in ('<b>A:', '<b>Cookie:</b>', 'c=d</td>'):
            self.assertIn(fragment, body)

    def test_10_http_post(self):
        task = copy.deepcopy(self.sample_task_http)
        task['fetch'].update(method='POST', data='binux', cookies={'c': 'd'})
        fetched = self.fetcher.sync_fetch(task)
        self.assertEqual(fetched['status_code'], 200)
        self.assertEqual(fetched['orig_url'], self.sample_task_http['url'])
        self.assertEqual(fetched['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', fetched)
        body = fetched['content']
        self.assertIn('<h2>POST', body)
        self.assertIn('..A:', body)
        self.assertIn('..Cookie:', body)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', body)
        self.assertIn('c=d', body)
        self.assertIn('binux', body)

    def test_20_dataurl_get(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = 'data:,hello'
        fetched = self.fetcher.sync_fetch(task)
        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('content', fetched)
        self.assertEqual(fetched['content'], 'hello')

    def test_30_with_queue(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = 'data:,hello'
        self.inqueue.put(task)
        _task, fetched = self.outqueue.get()
        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('content', fetched)
        self.assertEqual(fetched['content'], 'hello')

    def test_40_with_rpc(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = 'data:,hello'
        packed = self.rpc.fetch(task).data
        fetched = umsgpack.unpackb(packed)
        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('content', fetched)
        # NOTE(review): the remainder of this test is not visible in the source listing.
#.........这里部分代码省略.........
开发者ID:aleemb,项目名称:pyspider,代码行数:101,代码来源:test_fetcher.py
示例13: TestSplashFetcher
class TestSplashFetcher(unittest.TestCase):
    """Fetcher tests that request the ``'splash'`` fetch type."""

    @property
    def sample_task_http(self):
        """Build and return a new task dict on every access."""
        fetch_options = {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'cookies': {'c': 'd'},
            'timeout': 60,
            'save': 'abc',
        }
        process_options = {'callback': 'callback', 'save': [1, 2, 3]}
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': fetch_options,
            'process': process_options,
        }

    @classmethod
    def setUpClass(self):
        """Start httpbin on all interfaces, the fetcher and a pyproxy process."""
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        # httpbin is addressed through this host's resolved address, not 127.0.0.1.
        local_address = socket.gethostbyname(socket.gethostname())
        self.httpbin = 'http://' + local_address + ':14887'

        self.inqueue, self.outqueue = Queue(10), Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

        pyproxy_argv = ['pyproxy', '--username=binux', '--password=123456',
                        '--port=14830', '--debug']
        self.proxy_thread = subprocess.Popen(pyproxy_argv, close_fds=True)
        self.proxy = '127.0.0.1:14830'

    @classmethod
    def tearDownClass(self):
        """Stop every helper, then assert that the known ports are closed."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()
        self.rpc._quit()
        self.thread.join()

        for port in (5000, 23333, 24444, 25555, 14887):
            assert not utils.check_port_open(port)
        time.sleep(1)

    def test_69_no_splash(self):
        saved_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None

        task = self.sample_task_http
        task['url'] = self.httpbin + '/get'
        task['fetch']['fetch_type'] = 'splash'
        fetched = self.fetcher.sync_fetch(task)
        response = rebuild_response(fetched)
        self.assertEqual(response.status_code, 501, fetched)

        self.fetcher.splash_endpoint = saved_endpoint

    def test_70_splash_url(self):
        task = self.sample_task_http
        task['url'] = self.httpbin + '/get'
        task['fetch']['fetch_type'] = 'splash'
        fetched = self.fetcher.sync_fetch(task)
        response = rebuild_response(fetched)
        self.assertEqual(response.status_code, 200, fetched)
        self.assertEqual(response.orig_url, task['url'])
        self.assertEqual(response.save, task['fetch']['save'])

        echoed = json.loads(response.doc('pre').text())
        self.assertIsNotNone(echoed, response.content)
        self.assertEqual(echoed['headers'].get('A'), 'b', response.json)
        self.assertEqual(echoed['headers'].get('Cookie'), 'c=d', response.json)

    def test_75_splash_robots(self):
        task = self.sample_task_http
        task['url'] = self.httpbin + '/deny'
        task['fetch']['fetch_type'] = 'splash'
        task['fetch']['robots_txt'] = True
        fetched = self.fetcher.sync_fetch(task)
        response = rebuild_response(fetched)
        # NOTE(review): the remainder of this test is not visible in the source listing.
#.........这里部分代码省略.........
开发者ID:eromoe,项目名称:pyspider,代码行数:101,代码来源:test_fetcher.py
示例14: TestFetcher
class TestFetcher(unittest.TestCase):
    """Fetcher tests that run against a local httpbin instance."""

    # Base task; every test deep-copies it and fills in 'url'.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'cookies': {'c': 'd'},
            'timeout': 60,
            'save': 'abc',
        },
        'process': {'callback': 'callback', 'save': [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the fetcher, pyproxy and (if startable) phantomjs."""
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue, self.outqueue = Queue(10), Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

        pyproxy_argv = ['pyproxy', '--username=binux', '--password=123456',
                        '--port=14830', '--debug']
        self.proxy_thread = subprocess.Popen(pyproxy_argv, close_fds=True)
        self.proxy = '127.0.0.1:14830'

        # phantomjs is optional: failure to launch it leaves the attribute None.
        js_path = os.path.join(os.path.dirname(__file__),
                               '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', js_path, '25555'])
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Stop every helper, then assert that the known ports are closed."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()

        for port in (5000, 23333, 24444, 25555, 14887):
            assert not utils.check_port_open(port)
        time.sleep(1)

    def test_10_http_get(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = self.httpbin+'/get'
        fetched = self.fetcher.sync_fetch(task)
        response = rebuild_response(fetched)

        self.assertEqual(response.status_code, 200, fetched)
        self.assertEqual(response.orig_url, task['url'])
        self.assertEqual(response.save, task['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = self.httpbin+'/post'
        task['fetch']['method'] = 'POST'
        task['fetch']['data'] = 'binux'
        task['fetch']['cookies'] = {'c': 'd'}
        fetched = self.fetcher.sync_fetch(task)
        response = rebuild_response(fetched)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, task['url'])
        self.assertEqual(response.save, task['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        # NOTE(review): the remainder of this test is not visible in the source listing.
#.........这里部分代码省略.........
开发者ID:eromoe,项目名称:pyspider,代码行数:101,代码来源:test_fetcher.py
示例15: TestFetcher
class TestFetcher(unittest.TestCase):
    """Fetcher tests whose HTTP assertions compare against byte strings."""

    # Base task; tests that change it take a deep copy first.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'cookies': {'c': 'd'},
            'timeout': 60,
            'save': 'abc',
        },
        'process': {'callback': 'callback', 'save': [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher, start its threads and (if startable) phantomjs."""
        rpc_port = 24444
        self.inqueue, self.outqueue = Queue(10), Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % rpc_port)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
        self.thread = utils.run_in_thread(self.fetcher.run)

        # phantomjs is optional: failure to launch it leaves the attribute None.
        js_path = os.path.join(os.path.dirname(__file__),
                               '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', js_path, '25555'])
        except OSError:
            self.phantomjs = None

    @classmethod
    def tearDownClass(self):
        """Stop phantomjs when it was started, then the fetcher."""
        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def test_10_http_get(self):
        fetched = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(fetched['status_code'], 200)
        self.assertEqual(fetched['orig_url'], self.sample_task_http['url'])
        self.assertEqual(fetched['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', fetched)
        body = fetched['content']
        for fragment in (b'<b>A:', b'<b>Cookie:</b>', b'c=d</td>'):
            self.assertIn(fragment, body)

    def test_10_http_post(self):
        task = copy.deepcopy(self.sample_task_http)
        task['fetch'].update(method='POST', data='binux', cookies={'c': 'd'})
        fetched = self.fetcher.sync_fetch(task)
        self.assertEqual(fetched['status_code'], 200)
        self.assertEqual(fetched['orig_url'], self.sample_task_http['url'])
        self.assertEqual(fetched['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', fetched)
        body = fetched['content']
        self.assertIn(b'<h2>POST', body)
        self.assertIn(b'A:', body)
        self.assertIn(b'Cookie:', body)
        # FIXME: cookies in headers not supported
        self.assertNotIn(b'a=b', body)
        self.assertIn(b'c=d', body)
        self.assertIn(b'binux', body)

    def test_20_dataurl_get(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = 'data:,hello'
        fetched = self.fetcher.sync_fetch(task)
        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('content', fetched)
        self.assertEqual(fetched['content'], 'hello')

    def test_30_with_queue(self):
        task = copy.deepcopy(self.sample_task_http)
        task['url'] = 'data:,hello'
        self.inqueue.put(task)
        _task, fetched = self.outqueue.get()
        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('content', fetched)
        self.assertEqual(fetched['content'], 'hello')

    def test_40_with_rpc(self):
        data = copy.deepcopy(self.sample_task_http)
        # NOTE(review): the remainder of this test is not visible in the source listing.
#.........这里部分代码省略.........
开发者ID:0xa-cc,项目名称:pyspider,代码行数:101,代码来源:test_fetcher.py
示例16: run_fetcher
def run_fetcher(g=g):
    """Create a ``Fetcher`` on the shared queues, publish it on ``g`` and run it.

    ``xmlrpc_run`` is handed to ``run_in_thread`` with no extra arguments
    before ``run()`` is called in the current thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    instance = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    g.fetcher = instance

    run_in_thread(instance.xmlrpc_run)
    instance.run()
开发者ID:CoralResort,项目名称:pyspider,代码行数:6,代码来源:run.py
示例17: TestResponse
class TestResponse(unittest.TestCase):
    """Checks responses rebuilt from results fetched from a local httpbin."""

    # Minimal task; ``get`` fills in 'url' on a deep copy for every request.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher and launch httpbin on port 14887."""
        # ``async`` is a reserved word from Python 3.7 on, so it cannot be
        # written as a keyword argument; dict unpacking passes the same
        # argument and parses on every Python version.
        self.fetcher = Fetcher(None, None, **{'async': False})
        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Terminate the httpbin process started in ``setUpClass``."""
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        """Fetch ``url`` and return the rebuilt response.

        A ``url`` that does not start with ``'http://'`` is appended to the
        httpbin base address.  ``kwargs`` are merged into the task dict.
        No status check is made here; callers assert what they expect.
        """
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        response = self.get('/gzip')
        self.assertEqual(response.status_code, 200)
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertEqual(response.status_code, 200)
        self.assertIn('deflated', response.text)

    def test_50_ok(self):
        response = self.get('/status/200')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        response = self.get('/status/302')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        with self.assertRaises(Exception):
            # NOTE(review): this looks the method up on the TestCase, not on
            # ``response``, so the AttributeError alone satisfies the
            # assertion.  Presumably ``response.raise_for_status`` was
            # intended - confirm before changing, as the outcome may differ.
            self.raise_for_status(allow_redirects=False)

    def test_60_not_ok(self):
        for path in ('/status/400', '/status/500', '/status/600'):
            response = self.get(path)
            self.assertFalse(response.ok)
            self.assertFalse(response)

    def test_70_reraise_exception(self):
        response = self.get('file://abc')
        with self.assertRaisesRegexp(Exception, 'HTTP 599'):
            response.raise_for_status()
开发者ID:Dmitry-Kucher,项目名称:pyspider,代码行数:73,代码来源:test_response.py
示例18: TestFetcher
class TestFetcher(unittest.TestCase):
    """Fetcher tests: direct ``sync_fetch``, queue round trip and XML-RPC."""

    # Shared fixture.  Tests that modify a task must work on a deep copy
    # (see ``_new_task``) so that this dict is never mutated.
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "http://echo.opera.com/",
        "fetch": {"method": "GET", "headers": {"Cookie": "a=b", "a": "b"}, "timeout": 60, "save": "abc"},
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        """Create the fetcher and start its RPC endpoint and main loop."""
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)

    @classmethod
    def tearDownClass(self):
        """Ask the fetcher to quit over RPC, wait for its thread, then pause."""
        self.rpc._quit()
        self.thread.join()
        time.sleep(1)

    def _new_task(self):
        """Return an independent deep copy of ``sample_task_http``.

        ``dict(self.sample_task_http)`` copies only the top level, so writing
        to ``task["fetch"][...]`` would modify the class-level fixture and
        leak into every later test.
        """
        import copy  # local import keeps the class free of extra module-level needs
        return copy.deepcopy(self.sample_task_http)

    def test_10_http_get(self):
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)
        content = result["content"]
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        self.assertIn("a=b", content)

    def test_10_http_post(self):
        request = self._new_task()
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)
        content = result["content"]
        self.assertIn("<h2>POST", content)
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        # FIXME: cookies in headers not supported
        self.assertNotIn("a=b", content)
        self.assertIn("c=d", content)
        self.assertIn("binux", content)

    def test_20_dataurl_get(self):
        data = self._new_task()
        data["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_30_with_queue(self):
        data = self._new_task()
        data["url"] = "data:,hello"
        self.inqueue.put(data)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_40_with_rpc(self):
        data = self._new_task()
        data["url"] = "data:,hello"
        result = umsgpack.unpackb(self.rpc.fetch(data).data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_50_base64_data(self):
        request = self._new_task()
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        # The four bytes encoded in the payload are expected back as hex.
        self.assertIn(" d6 ", result["content"])
        self.assertIn(" d0 ", result["content"])
        self.assertIn(" ce ", result["content"])
        self.assertIn(" c4 ", result["content"])
开发者ID:hemengsi123,项目名称:pyspider,代码行数:92,代码来源:test_fetcher.py
示例19: setUpClass
def setUpClass(self):
    """Create the fetcher and launch httpbin on port 14887."""
    # ``async`` is a reserved word from Python 3.7 on, so it cannot be written
    # as a keyword argument; dict unpacking passes the same argument and
    # parses on every Python version.
    self.fetcher = Fetcher(None, None, **{'async': False})
    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'
    time.sleep(0.5)
开发者ID:Dmitry-Kucher,项目名称:pyspider,代码行数:5,代码来源:test_response.py
示例20: TestFetcher
class TestFetcher(unittest.TestCase):
sample_task_http = {
"taskid": "taskid",
"project": "project",
"url": "",
"fetch": {
"method": "GET",
"headers": {"Cookie": "a=b", "a": "b"},
"cookies": {"c": "d"},
"timeout": 60,
"save": "abc",
},
"process": {"callback": "callback", "save": [1, 2, 3]},
}
@classmethod
def setUpClass(self):
    """Start httpbin, the fetcher, pyproxy and (if startable) phantomjs.

    ``self.phantomjs`` is ``None`` when launching the ``phantomjs``
    executable raises ``OSError``.
    """
    import tests.data_test_webpage
    import httpbin

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
    self.httpbin = "http://127.0.0.1:14887"

    self.inqueue, self.outqueue = Queue(10), Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = "127.0.0.1:25555"

    # The RPC client and the RPC server thread must agree on the port.
    rpc_port = 24444
    self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % rpc_port)
    self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=rpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)

    pyproxy_argv = ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"]
    self.proxy_thread = subprocess.Popen(pyproxy_argv, close_fds=True)
    self.proxy = "127.0.0.1:14830"

    js_path = os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js")
    try:
        self.phantomjs = subprocess.Popen(["phantomjs", js_path, "25555"])
    except OSError:
        self.phantomjs = None
    time.sleep(0.5)
@classmethod
def tearDownClass(self):
self.proxy_thread.terminate()
self.proxy_thread.wait()
self.httpbin_thread.terminate()
self.httpbin_thread.join()
if self.phantomjs:
self.phanto
|
请发表评论