• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python webpage.WebPage类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中webpage.WebPage的典型用法代码示例。如果您正苦于以下问题:Python WebPage类的具体用法?Python WebPage怎么用?Python WebPage使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了WebPage类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: set_data

    def set_data(self):
        """Build the POST payload for the class-search form.

        Parses ``self.htmldata1`` for the options of the term drop-down,
        stores the option-text -> option-value mapping in ``self.semesters``,
        picks the value whose text contains ``self.season`` (case-insensitive)
        and ``self.year``, and stores the form fields in ``self.data``.

        When no option matches, a warning is printed and the code ``"0"`` is
        submitted.
        """
        w=WebPage(htmldata=self.htmldata1)

        #find all the codes for season/year in the first html form data
        self.semesters={}
        xpath="""//*[@id="CLASS_SRCH_WRK2_STRM$35$"]/option"""
        for e in w.get_from_xpath(xpath):
            # presumably lxml/ElementTree elements: .text is None for an empty
            # <option> and .get() is None for a missing attribute, so coerce
            # both to "" instead of crashing on None.strip().
            key=e.text or ""
            semester=e.get("value") or ""
            if key.strip() and semester.strip():
                self.semesters[key]=semester

        #match up season/year to the codes we just found, if possible
        code=0
        for key,value in self.semesters.items():
            if self.season.lower() in key.lower() and str(self.year) in key:
                code=value
                break

        if not code:
            print_color("Warning: failed to find season/year in search options. season='%s' year='%s'"%(self.season,self.year),COLORS.RED)
            print_d("search options",self.semesters)

        self.data={"ICFocus":"SSR_CLSRCH_WRK_ACAD_CAREER$2",
                "CLASS_SRCH_WRK2_STRM$35$":str(code),
                "SSR_CLSRCH_WRK_SUBJECT$0":self.department,
                "ICAction":"CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH",
                "SSR_CLSRCH_WRK_ACAD_CAREER$2":self.level}
开发者ID:Zulban,项目名称:zconcordia,代码行数:27,代码来源:concordia_search.py


示例2: Crawler

class Crawler():
    """Rule-driven crawler backed by three on-disk stores.

    URLs are popped from ``self.queue``, downloaded, saved to
    ``self.webpagedb``, and the links that match the rules registered for the
    page's URL are pushed back onto the queue (after duplicate filtering).
    """

    def __init__(self ):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        """Open the URL queue, the page store and the duplicate-check store."""
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        """Queue the links that the duplicate checker has not seen yet."""
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        """Replace the rule table.

        ``rules`` maps a page-URL regex string to an iterable of link regex
        strings; both sides are compiled and stored in ``self.rules``.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self,url):
        """Return the de-duplicated link patterns of every rule matching ``url``."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until the queue returns ``None``, sleeping 3s between URLs."""
        while True:
            url = self.queue.pop_url()
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.html2db(url,html)

                self.webpage = WebPage(url,html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                links = self.webpage.filter_links(tags = ['a'], patterns= ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep ``n`` seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
开发者ID:ricocmc,项目名称:pythonTraining,代码行数:58,代码来源:crawler.py


示例3: Crawler

class Crawler():
    """Rule-driven crawler that stores pages and extracted articles via ``OperatorDB``."""

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        """Hand the links to the database layer for queuing."""
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        """Replace the rule table.

        ``rules`` maps a page-URL regex string to an iterable of link regex
        strings; both sides are compiled and stored in ``self.rules``.
        """
        self.rules = {}
        for url, inurls in rules.items():
            self.rules[re.compile(url)] = [re.compile(u) for u in inurls]

    def get_patterns_from_rules(self, url):
        """Return the de-duplicated link patterns of every rule matching ``url``."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Crawl until ``pop_url()`` returns ``None``.

        Each page is stored with its extracted article fields when
        ``extract()`` yields more than five items, otherwise as raw HTML only.
        Any exception is reported and the database handle is closed.
        """
        while True:
            # Bound before the try so the error report below never hits an
            # unbound (or stale) name when pop_url() itself raises.
            url = None
            try:
                url = self.dbop.pop_url()
                print("url: %s" % url)
                if url is None:
                    print("crawling task is done.")
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print(self.webpage.parse_links())
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                # NOTE(review): mysleep() is not defined in the visible part
                # of this class - confirm it exists elsewhere.
                self.mysleep(3)
            except Exception as err:
                print("!!error!! Exception happend! %s %s" % (url, err))
                # NOTE(review): the loop keeps running after the handle is
                # closed - confirm OperatorDB tolerates use after close().
                self.dbop.close()
开发者ID:agentwx,项目名称:auCrawler,代码行数:57,代码来源:crawler.py


示例4: updateView

    def updateView(self):
        """Install a fresh WebPage on the web view and render the generated HTML.

        All links are delegated and this object is exposed to the page's
        JavaScript as ``qtWindow``.
        """
        web_page = WebPage(logger=None, parent=self)
        web_page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
        web_page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)

        view = self.ui.webView
        view.setPage(web_page)

        content = self.generateHtml()
        # The base URL has to keep its trailing '/', otherwise QWebView cannot
        # load files located under it.
        static_url = QUrl.fromLocalFile(os.path.join(self.dataDir, "static/"))
        view.setHtml(content, static_url)
开发者ID:agateau,项目名称:qyok,代码行数:11,代码来源:mainwindow.py


示例5: __init__

 def __init__(self):
     """Create the Qt application (if needed) and wire up the web view.

     Builds the event loop, cookie jar, network access manager, page and
     view, applies the QWebSettings flags below, and connects
     ``network_reply_finished`` to the ``finished`` signal of the page's
     network access manager.
     """
     logging.debug("-->")
     super(WebBrowser, self).__init__()
     # Reuse the existing QApplication when there is one; otherwise create it.
     self.app = QApplication.instance()
     if self.app is None:
         self.app = QApplication(sys.argv)
         self.app.setQuitOnLastWindowClosed(False)
     self.event_loop = QEventLoop()
     self.cookie_jar = CookieJar()
     # The proxy object is built but not installed: setProxy() below is
     # commented out.
     self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
     self.network_manager = NetworkAccessManager() 
     self.network_manager.setCookieJar(self.cookie_jar)
     # self.network_manager.setProxy(self.proxy)
     self.web_page = WebPage()        
     self.web_page.setNetworkAccessManager(self.network_manager)
     self.web_view = QWebView()
     self.web_view.setPage(self.web_page)        
     # Images off; plugins, JavaScript and local->remote access on.
     self.web_view.settings().setAttribute(QWebSettings.AutoLoadImages,False)
     self.web_view.settings().setAttribute(QWebSettings.PluginsEnabled, True)
     self.web_view.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
     # self.web_view.settings().setAttribute(QWebSettings.XSSAuditingEnabled, False)
     self.web_view.settings().setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True) 
     self.connect(self.web_view.page().networkAccessManager(),SIGNAL("finished(QNetworkReply*)"),self.network_reply_finished)
     # Per-load state, all unset until a load is started.
     self.page_loaded_validator = None
     self.page_loaded_handler = None
     self.page_loaded_handler_kwargs = None
     self.timeout_message = None
     self.timer = None
     self.event_loop_exception = None
     logging.debug("<--")
开发者ID:mguillech,项目名称:tfb-scraper,代码行数:30,代码来源:webbrowser.py


示例6: getlinks

 def getlinks(self,url,html):
     """Parse ``html`` and return its <a> links that match the rules for ``url``.

     The parsed page is kept on ``self.webpage``.
     """
     self.webpage = WebPage(url, html)
     self.webpage.parse_links()
     url_patterns = self.get_patterns_from_rules(url)
     return self.webpage.filter_links(tags=['a'], patterns=url_patterns)
开发者ID:acha21,项目名称:python-crawler,代码行数:7,代码来源:crawler.py


示例7: start

    def start(self):
        """Crawl until the queue returns ``None``.

        Every downloaded page is stored, its rule-matching links are queued,
        and the links matching ``self.file_rule`` are recorded in
        ``self.files`` and passed to ``download_files``.
        """
        while True:
            url = self.queue.pop_url()
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is None:
                # Nothing was downloaded; move on to the next URL.
                continue
            self.webpagedb.html2db(url,html)

            self.webpage = WebPage(url,html)
            self.webpage.parse_links()
            ruptn = self.get_patterns_from_rules(url)
            links = self.webpage.filter_links(tags = ['a'], patterns= ruptn)
            print(links)
            self.add_seeds(links)
            file_pattern = [re.compile(self.file_rule)]
            files = self.webpage.filter_links(tags = ['a'], patterns = file_pattern)
            # NOTE(review): this stores the whole list as ONE element of
            # self.files; confirm whether extend() was intended.
            self.files.append(files)
            # TODO: left unspecified by the original author.
            self.download_files(files)
            print(files)
开发者ID:gsliu,项目名称:Cybertron,代码行数:26,代码来源:crawler.py


示例8: start

 def start(self):
     """Crawl until ``pop_url()`` returns ``None``.

     Each page is stored with its extracted article fields when
     ``extract()`` yields more than five items, otherwise as raw HTML only.
     Any exception is reported and the database handle is closed.
     """
     while True:
         # Bound before the try so the error report below never hits an
         # unbound (or stale) name when pop_url() itself raises.
         url = None
         try:
             url = self.dbop.pop_url()
             print("url: %s" % url)
             if url is None:
                 print("crawling task is done.")
                 break
             error_msg, url, redirected_url, html = self.downloader.download(url)
             if html is not None:
                 self.webpage = WebPage(url, html)
                 article = self.webpage.extract()
                 if len(article) > 5:
                     addtime = "%s %s" % (article[1], article[2])
                     self.dbop.html2db(url, html,
                                       article[0],
                                       addtime,
                                       article[3],
                                       article[5])
                 else:
                     self.dbop.html2db(url, html)
                 print(self.webpage.parse_links())
                 ruptn = self.get_patterns_from_rules(url)
                 links = self.webpage.filter_links(tags=['a'],
                                                   str_patterns=ruptn)
                 self.add_seeds(links)
             self.mysleep(3)
         except Exception as err:
             print("!!error!! Exception happend! %s %s" % (url, err))
             # NOTE(review): the loop keeps running after the handle is
             # closed - confirm the DB layer tolerates use after close().
             self.dbop.close()
开发者ID:agentwx,项目名称:auCrawler,代码行数:31,代码来源:crawler.py


示例9: __init__

    def __init__(self, parent, args):
        """Set up the main page, proxy, network manager and JS bootstrap.

        ``args`` supplies verbose, script, script_args, proxy, disk_cache,
        cookies, ignore_ssl_errors, load_images, load_plugins and
        local_access_remote. Exits the process when ``:/bootstrap.js`` cannot
        be opened or is empty.
        """
        QObject.__init__(self, parent)

        # bookkeeping state
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # values taken straight from the parsed arguments
        self.m_scriptFile = args.script
        self.m_args = args.script_args

        self.m_filesystem = FileSystem(self)

        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is not None:
            app_proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(app_proxy)
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        # The page gets a custom Network Access Manager instead of the default
        self.m_netAccessMan = NetworkAccessManager(self, args.disk_cache, args.cookies, args.ignore_ssl_errors)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings.update({
            'loadImages': args.load_images,
            'loadPlugins': args.load_plugins,
            'javascriptEnabled': True,
            'XSSAuditingEnabled': False,
            'userAgent': self.m_page.userAgent(),
            'localAccessRemote': args.local_access_remote,
        })
        self.m_page.applySettings(self.m_defaultPageSettings)

        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # expose this object and the filesystem helper to JavaScript
        main_frame = self.m_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('phantom', self)
        main_frame.addToJavaScriptWindowObject('fs', self.m_filesystem)

        bootstrap_file = QFile(':/bootstrap.js')
        if not bootstrap_file.open(QFile.ReadOnly):
            sys.exit('Can not bootstrap!')
        bootstrap_source = str(bootstrap_file.readAll())
        bootstrap_file.close()
        if not bootstrap_source:
            sys.exit('Can not bootstrap!')
        main_frame.evaluateJavaScript(bootstrap_source)

        do_action('PhantomInitPost')
开发者ID:aivaturi,项目名称:phantomjs,代码行数:56,代码来源:phantom.py


示例10: __init__

    def __init__(self, args, parent = None):
        """Read the script, configure the page and hook up the load signals.

        ``args`` supplies verbose, script (a list whose first item is an open
        file and whose remaining items are the script arguments), upload_file,
        load_images, load_plugins and proxy.
        """
        QObject.__init__(self, parent)

        # variable declarations
        # Every attribute gets its own object: the former chained assignment
        # bound m_var, m_paperSize and m_loadScript_cache to one shared dict,
        # so writing a key through one name showed up under the other two.
        self.m_loadStatus = QString()
        self.m_state = QString()
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = QString.fromUtf8(args.script[0].read())
        self.m_scriptFile = args.script[0].name
        self.m_args = args.script[1:]
        self.m_upload_file = args.upload_file
        # Images load unless explicitly 'no'; plugins only when explicitly 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script[0].close()

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        settings = self.m_page.settings()
        settings.setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        settings.setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        settings.setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        settings.setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        settings.setAttribute(QWebSettings.LocalStorageEnabled, True)
        settings.setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        settings.setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        # if our script was called in a different directory, change to it
        # to make any dealings with files be relative to the scripts directory
        script_dir = os.path.dirname(self.m_scriptFile)
        if script_dir:
            os.chdir(script_dir)

        # The custom network access manager is only installed in verbose mode.
        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.connect(self.m_page.mainFrame(), SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
        self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
开发者ID:westonruter,项目名称:phantomjs,代码行数:55,代码来源:phantom.py


示例11: __init__

    def __init__(self, args, parent = None):
        """Read the script, configure the page and hook up the load signals.

        ``args`` supplies verbose, script (an open file), script_args,
        upload_file, load_images, load_plugins, proxy and disk_cache.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_loadStatus = self.m_state = ''
        # Every attribute gets its own dict: the former chained assignment
        # bound all three names to one shared dict, so writing a key through
        # one name showed up under the other two.
        self.m_var = {}
        self.m_paperSize = {}
        self.m_loadScript_cache = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_clipRect = QRect()
        # setup the values from args
        self.m_script = args.script.read()
        self.m_scriptFile = args.script.name
        # Script directory with a trailing platform separator appended.
        self.m_scriptDir = os.path.dirname(args.script.name)
        if sys.platform.startswith('win'):
            self.m_scriptDir += '\\'
        else:
            self.m_scriptDir += '/'
        self.m_args = args.script_args
        self.m_upload_file = args.upload_file
        # Images load unless explicitly 'no'; plugins only when explicitly 'yes'.
        autoLoadImages = args.load_images != 'no'
        pluginsEnabled = args.load_plugins == 'yes'

        args.script.close()

        palette = self.m_page.palette()
        palette.setBrush(QPalette.Base, Qt.transparent)
        self.m_page.setPalette(palette)

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        settings = self.m_page.settings()
        settings.setAttribute(QWebSettings.AutoLoadImages, autoLoadImages)
        settings.setAttribute(QWebSettings.PluginsEnabled, pluginsEnabled)
        settings.setAttribute(QWebSettings.FrameFlatteningEnabled, True)
        settings.setAttribute(QWebSettings.OfflineStorageDatabaseEnabled, True)
        settings.setAttribute(QWebSettings.LocalStorageEnabled, True)
        settings.setLocalStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))
        settings.setOfflineStoragePath(QDesktopServices.storageLocation(QDesktopServices.DataLocation))

        # Ensure we have a document.body.
        self.m_page.mainFrame().setHtml('<html><body></body></html>')

        self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)

        # The custom network access manager is only installed in verbose mode.
        if self.m_verbose:
            m_netAccessMan = NetworkAccessManager(args.disk_cache, self)
            self.m_page.setNetworkAccessManager(m_netAccessMan)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(self.inject)
        self.m_page.loadFinished.connect(self.finish)
开发者ID:NathanHowell,项目名称:phantomjs-waf,代码行数:55,代码来源:phantom.py


示例12: run

 def run(self):
     """Worker loop: take URLs from the spider's task list and fetch them.

     Runs while ``self.status`` is truthy. For each URL allowed by
     ``check_robots`` the page is fetched; on success its data is saved and
     every link not yet in ``visited_list`` is added to ``extend_list``.
     The spider's running counter is incremented per task and is always
     decremented again, including for robots-forbidden URLs and when an
     exception escapes.
     """
     while self.status:
         try:
             # Wait with a 1s timeout so self.status is re-checked regularly.
             url = self.spider.task_list.get(timeout = 1)
         except Empty:
             continue
         self.spider.increase_running()
         try:
             if not self.spider.check_robots(url):
                 log.info('%s - robots forbidden : %s' % (self.name, url))
                 continue
             page = WebPage(url)
             if page.fetch():
                 self.spider.db.save_data(page.get_data())
                 for link in page.get_link():                        # retrieve links from html
                     if link not in self.spider.visited_list:        # not visited yet
                         self.spider.extend_list.add(link)
             else:
                 print('%s: Page fetch failed: %s' % (self.name, page.url))
         finally:
             # Previously skipped by the robots `continue` above and by any
             # exception, which left the running counter too high.
             self.spider.decrease_running()
开发者ID:YvesChan,项目名称:OpenSP,代码行数:21,代码来源:spider.py


示例13: createWebPage

 def createWebPage(self):
     """Create, register and configure an additional WebPage, then return it.

     The new page receives the default page settings, the shared network
     access manager and the directory of the running script as libraryPath.
     """
     new_page = WebPage(self)
     self.m_pages.append(new_page)
     new_page.applySettings(self.m_defaultPageSettings)
     new_page.setNetworkAccessManager(self.m_netAccessMan)
     script_path = os.path.abspath(self.m_scriptFile)
     new_page.libraryPath = os.path.dirname(script_path)
     return new_page
开发者ID:tryhendri,项目名称:phantomjs,代码行数:7,代码来源:phantom.py


示例14: __init__

    def __init__(self, args, parent=None):
        """Set up the main page, proxy, network manager and JS bootstrap.

        ``args`` supplies verbose, script, script_name, script_args, proxy,
        disk_cache, ignore_ssl_errors, load_images and load_plugins. Exits
        with status 1 when ``:/bootstrap.js`` cannot be opened or is empty.
        """
        QObject.__init__(self, parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_verbose = args.verbose
        self.m_page = WebPage(self)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.m_script = args.script
        self.m_scriptFile = args.script_name
        self.m_args = args.script_args

        # The hook receives a snapshot of this method's local variables, so
        # the set of local names here is part of what the hook sees.
        do_action('PhantomInitPre', Bunch(locals()))

        if not args.proxy:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        # Provide WebPage with a non-standard Network Access Manager
        self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self)
        self.m_page.setNetworkAccessManager(self.m_netAccessMan)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_page.applySettings(self.m_defaultPageSettings)

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        # Load the bootstrap script from the Qt resource system and run it in
        # the main frame; a missing or empty script is fatal.
        bootstrap = QFile(':/bootstrap.js')
        if not bootstrap.open(QFile.ReadOnly):
            qCritical('Can not bootstrap!')
            sys.exit(1)
        bootstrapper = str(bootstrap.readAll())
        bootstrap.close()
        if not bootstrapper:
            qCritical('Can not bootstrap!')
            sys.exit(1)
        self.m_page.mainFrame().evaluateJavaScript(bootstrapper)

        # Same as above: the hook gets the locals as they are at this point.
        do_action('PhantomInitPost', Bunch(locals()))
开发者ID:lautis,项目名称:phantomjs,代码行数:48,代码来源:phantom.py


示例15: add_webpage

 def add_webpage(self):
     """Interactively build a WebPage from console input and register it.

     Prompts for name, description, URL and three integer settings, then
     asks for confirmation until the answer is y/Y or n/N. On y/Y the page
     content is retrieved (a ValueError is logged) and the page is appended
     to ``self.__webpages``.
     """
     webpage = WebPage(name='', description='', url='', load_content=False)
     webpage.name = raw_input('Name: ')
     webpage.description = raw_input('Description: ')
     webpage.url = raw_input('URL: ')
     webpage.update_timeout = int(raw_input('Update timeout: '))
     webpage.request_timeout = int(raw_input('Request timeout: '))
     webpage.data_offset = int(raw_input('Data offset: '))
     while True:
         confirm = raw_input('Save? (y/n)')
         if confirm in ('y', 'Y'):
             try:
                 webpage.current = webpage.retrieve()
             except ValueError as e:
                 logger.error('[!] Error: ' + str(e))
             # NOTE(review): the page is registered even when retrieve()
             # failed - confirm this is intended.
             self.__webpages.append(webpage)
             #self.start_updater()
             break
         if confirm in ('n', 'N'):
             break
开发者ID:piero,项目名称:WebDiff,代码行数:21,代码来源:console_interface.py


示例16: __init__

    def __init__(self, parent, args):
        """Set up the main page, proxy, default settings and JS bootstrap.

        ``args`` supplies verbose, script, script_args, script_encoding,
        output_encoding, proxy, load_images, load_plugins and
        local_to_remote_url_access.
        """
        super(Phantom, self).__init__(parent)

        # bookkeeping state
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # values taken straight from the parsed arguments
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        if args.proxy is not None:
            app_proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(app_proxy)
        else:
            QNetworkProxyFactory.setUseSystemConfiguration(True)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings.update({
            'loadImages': args.load_images,
            'loadPlugins': args.load_plugins,
            'javascriptEnabled': True,
            'XSSAuditingEnabled': False,
            'userAgent': self.m_page.userAgent(),
            'localToRemoteUrlAccessEnabled': args.local_to_remote_url_access,
        })
        self.m_page.applySettings(self.m_defaultPageSettings)

        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # expose this object to JavaScript as `phantom`
        main_frame = self.m_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('phantom', self)

        with QPyFile(':/bootstrap.js') as js_file:
            bootstrap_js = str(js_file.readAll())
        main_frame.evaluateJavaScript(bootstrap_js)

        do_action('PhantomInitPost')
开发者ID:jsnell,项目名称:phantomjs,代码行数:47,代码来源:phantom.py


示例17: start

 def start(self):
     """Crawl until the queue returns ``None``, sleeping 3s between URLs.

     Every downloaded page is stored and its <a> links that match the rules
     for the page's URL are queued as new seeds.
     """
     while True:
         url = self.queue.pop_url()
         # Parenthesised single-argument print behaves the same on
         # Python 2 and Python 3.
         print(url)
         if url is None:
             print("crawling task is done.")
             break
         error_msg, url, redirected_url, html = self.downloader.download(url)
         if html is not None:
             self.webpagedb.html2db(url,html)

             self.webpage = WebPage(url,html)
             self.webpage.parse_links()
             ruptn = self.get_patterns_from_rules(url)
             print(ruptn)
             links = self.webpage.filter_links(tags = ['a'], patterns= ruptn)
             self.add_seeds(links)
         self.mysleep(3)
开发者ID:ricocmc,项目名称:pythonTraining,代码行数:19,代码来源:crawler.py


示例18: start

 def start(self):
     """Crawl until the queue returns ``None``, sleeping 3s between URLs.

     Every downloaded page is stored and its <a> links that match the rules
     for the page's URL are queued as new seeds when any were found.
     """
     while True:
         url = self.queue.popUrl()
         # Parenthesised single-argument print behaves the same on
         # Python 2 and Python 3.
         print(url)
         if url is None:
             print("crawling task is done.")
             break
         error_msg, url, redirected_url, html = self.downloader.download(url)
         if html is not None:
             self.webpagedb.storeHtmlToDb(url,html)  # persist the page

             self.webpage = WebPage(url,html)  # start parsing the page
             self.webpage.parseLinks()  # collect all hyperlinks
             ruptn = self.get_patterns_from_rules(url)
             print(ruptn)
             # The original author noted that this call can yield None,
             # hence the truthiness check before queuing.
             links = self.webpage.filter_links(tags = ['a'], patterns= ruptn)
             if links:
                 self.addSeeds(links)
         self.mysleep(3)  # pause briefly before crawling on
开发者ID:ReedGuo,项目名称:framework,代码行数:20,代码来源:crawler.py


示例19: WebBrowser

class WebBrowser(QObject):
    """QtWebKit browser wrapper that follows redirects and exposes cookies."""

    def __init__(self):
        """Create the Qt application (if needed) and wire up the web view."""
        logging.debug("-->")
        super(WebBrowser, self).__init__()
        # Reuse the existing QApplication when there is one; otherwise create it.
        self.app = QApplication.instance()
        if self.app is None:
            self.app = QApplication(sys.argv)
            self.app.setQuitOnLastWindowClosed(False)
        self.event_loop = QEventLoop()
        self.cookie_jar = CookieJar()
        # The proxy object is built but not installed: setProxy() below is
        # commented out.
        self.proxy = QNetworkProxy(QNetworkProxy.HttpProxy, "127.0.1.1", 8888)
        self.network_manager = NetworkAccessManager()
        self.network_manager.setCookieJar(self.cookie_jar)
        # self.network_manager.setProxy(self.proxy)
        self.web_page = WebPage()
        self.web_page.setNetworkAccessManager(self.network_manager)
        self.web_view = QWebView()
        self.web_view.setPage(self.web_page)
        # Images off; plugins, JavaScript and local->remote access on.
        self.web_view.settings().setAttribute(QWebSettings.AutoLoadImages,False)
        self.web_view.settings().setAttribute(QWebSettings.PluginsEnabled, True)
        self.web_view.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
        # self.web_view.settings().setAttribute(QWebSettings.XSSAuditingEnabled, False)
        self.web_view.settings().setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True)
        self.connect(self.web_view.page().networkAccessManager(),SIGNAL("finished(QNetworkReply*)"),self.network_reply_finished)
        # Per-load state, all unset until a load is started.
        self.page_loaded_validator = None
        self.page_loaded_handler = None
        self.page_loaded_handler_kwargs = None
        self.timeout_message = None
        self.timer = None
        self.event_loop_exception = None
        logging.debug("<--")

    def network_reply_finished(self,reply):
        """Mark the request as completed and follow a redirect if one is set."""
        logging.debug("Reply received for: " + reply.request().url().toString())
        self.network_manager.request_queue[reply.request().url()] = "Completed"
        redirect_url = self.get_redirect_url(reply.attribute(QNetworkRequest.RedirectionTargetAttribute),reply.request().url())
        if redirect_url is not None:
            self.redirect(redirect_url,reply.request())

    def redirect(self,url,request):
        """Load ``url`` in the frame that issued ``request``, if it is found."""
        frame = self.find_frame_to_redirect(self.web_view.page().mainFrame(),request)
        if frame is not None:
            logging.debug("Redirecting to: " + url.toString())
            frame.load(url)

    def find_frame_to_redirect(self,frame,request):
        """Depth-first search for the frame whose requested URL matches.

        Returns the matching frame, or ``None`` when neither ``frame`` nor any
        descendant requested ``request.url()``.
        """
        if frame.requestedUrl() == request.url():
            return frame
        for child in frame.childFrames():
            frame_to_redirect = self.find_frame_to_redirect(child,request)
            if frame_to_redirect is not None:
                return frame_to_redirect
        return None

    def get_redirect_url(self,possible_redirect_url, orig_requested_url):
        """Return the URL to redirect to, or ``None`` when there is none.

        A relative target is completed in place with the scheme and host of
        the original request; a target equal to the original URL is ignored.
        """
        if possible_redirect_url is None:
            return None
        if possible_redirect_url.isRelative():
            if orig_requested_url.isRelative():
                return None
            possible_redirect_url.setScheme(orig_requested_url.scheme())
            possible_redirect_url.setHost(orig_requested_url.host())
        if orig_requested_url != possible_redirect_url:
            return possible_redirect_url
        return None

    def get_cookies(self):
        """Return every cookie in the jar in its raw form."""
        return [cookie.toRawForm() for cookie in self.cookie_jar.allCookies()]

    def set_cookies(self,raw_cookies):
        """Replace the jar's contents with the cookies parsed from ``raw_cookies``."""
        cookies = []
        for raw_cookie in raw_cookies:
            cookies.extend(QNetworkCookie.parseCookies(raw_cookie))
        self.cookie_jar.setAllCookies(cookies)

    def cleanup(self):
        """Disconnect the reply signal, detach the Qt objects and drop them."""
        logging.debug("-->")
        self.disconnect(self.web_view.page().networkAccessManager(),SIGNAL("finished(QNetworkReply*)"),self.network_reply_finished)
        self.web_view.setParent(None)
        self.web_page.setParent(None)
        self.network_manager.setParent(None)
        self.event_loop.setParent(None)
        self.setParent(None)
        del self.web_view
        del self.web_page
        del self.network_manager
        del self.event_loop
        del self.app
        logging.debug("<--")
开发者ID:mguillech,项目名称:tfb-scraper,代码行数:95,代码来源:webbrowser.py


示例20: Phantom

class Phantom(QObject):
    """Controller object exposed to the page's JavaScript as ``phantom``.

    Only part of the class is visible in this excerpt.
    """

    def __init__(self, parent, args):
        """Set up the main page, proxy, default settings and JS bootstrap.

        ``args`` supplies verbose, script, script_args, script_encoding,
        output_encoding, proxy, load_images, load_plugins and
        local_to_remote_url_access.
        """
        super(Phantom, self).__init__(parent)

        # variable declarations
        self.m_defaultPageSettings = {}
        self.m_pages = []
        self.m_verbose = args.verbose
        self.m_page = WebPage(self, args)
        self.m_returnValue = 0
        self.m_terminated = False
        # setup the values from args
        self.app_args = args
        self.m_scriptFile = args.script
        self.m_args = args.script_args
        self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8')
        self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys)

        self.m_pages.append(self.m_page)

        do_action('PhantomInitPre')

        # No explicit proxy: use the system configuration; otherwise install
        # an application-wide HTTP proxy from (host, port).
        if args.proxy is None:
            QNetworkProxyFactory.setUseSystemConfiguration(True)
        else:
            proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1]))
            QNetworkProxy.setApplicationProxy(proxy)

        self.m_page.javaScriptConsoleMessageSent.connect(self.printConsoleMessage)

        self.m_defaultPageSettings['loadImages'] = args.load_images
        self.m_defaultPageSettings['loadPlugins'] = args.load_plugins
        self.m_defaultPageSettings['javascriptEnabled'] = True
        self.m_defaultPageSettings['XSSAuditingEnabled'] = False
        self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent()
        self.m_defaultPageSettings['localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access
        self.m_page.applySettings(self.m_defaultPageSettings)

        self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))

        # inject our properties and slots into javascript
        self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self)

        # Read the bootstrap script from the Qt resource system and run it in
        # the main frame.
        with QPyFile(':/bootstrap.js') as f:
            bootstrap = str(f.readAll())
        self.m_page.mainFrame().evaluateJavaScript(bootstrap)

        do_action('PhantomInitPost')

    def execute(self):
        """Inject the user script into the main frame.

        Returns True unless ``exit`` has already marked this object terminated.
        """
        injectJsInFrame(self.m_scriptFile, self.m_scriptEncoding.encoding, os.path.dirname(os.path.abspath(__file__)), self.m_page.mainFrame(), True)
        return not self.m_terminated

    def printConsoleMessage(self, message, lineNumber, source):
        """Print a JS console message, prefixed with ``source:line`` when known."""
        if source:
            message = '%s:%d %s' % (source, lineNumber, message)
        print message

    def returnValue(self):
        """Return the exit code recorded by ``exit`` (0 until then)."""
        return self.m_returnValue

    ##
    # Properties and methods exposed to JavaScript
    ##

    @pyqtProperty('QStringList')
    def args(self):
        """The script arguments taken from ``args.script_args``."""
        return self.m_args

    @pyqtSlot(result=FileSystem)
    def createFilesystem(self):
        """Return a new FileSystem object parented to this Phantom."""
        return FileSystem(self)

    @pyqtSlot(result=WebPage)
    def createWebPage(self):
        """Create, register and configure an additional WebPage, then return it."""
        page = WebPage(self, self.app_args)
        self.m_pages.append(page)
        page.applySettings(self.m_defaultPageSettings)
        page.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile))
        return page

    @pyqtProperty('QVariantMap')
    def defaultPageSettings(self):
        """The settings dict applied to every page created by this object."""
        return self.m_defaultPageSettings

    # Two slot overloads: callable with no argument or with an int code.
    @pyqtSlot()
    @pyqtSlot(int)
    def exit(self, code=0):
        """Record ``code``, destroy every page and quit the application."""
        self.m_terminated = True
        self.m_returnValue = code

        # stop javascript execution in start script;
        # delete all the pages C++ objects, then clear
        # the page list, and empty the Phantom page
        for page in self.m_pages:
            sip.delete(page)
        del self.m_pages[:]
        self.m_page = None

        QApplication.instance().exit(code)
#.........这里部分代码省略.........
开发者ID:jsnell,项目名称:phantomjs,代码行数:101,代码来源:phantom.py



注:本文中的webpage.WebPage类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python downloader.Downloader类代码示例发布时间:2022-05-26
下一篇:
Python compiler.webpack函数代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap