• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python sessioninfomanager.updateSessionInfo函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中utils.sessioninfomanager.updateSessionInfo函数的典型用法代码示例。如果您正苦于以下问题:Python updateSessionInfo函数的具体用法?Python updateSessionInfo怎么用?Python updateSessionInfo使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了updateSessionInfo函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     pagination links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or no next-page link can be resolved.
     The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__total_threads_count = 0
     self.__last_timestamp = datetime(1980, 1, 1)
     # Upper bound on the number of threads to process, because not all
     # forums get updated every day.
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='anandtechforums_maxthreads'))
     self.__setSoupForCurrentUri()
     while self.__getThreads():
         try:
             next_page_uri = self.soup.find(
                 'a', text='>', rel='Next').parent['href']
             data_dict = dict(parse_qsl(next_page_uri.split('?')[-1]))
             # Drop the 's' query parameter before rebuilding the uri
             # (presumably a session id -- TODO confirm).
             data_dict.pop('s', None)
             self.currenturi = (self.__baseuri + 'forumdisplay.php?' +
                                urlencode(data_dict))
             self.__setSoupForCurrentUri()
         except Exception:
             # Any failure to resolve the next page ends the pagination;
             # KeyboardInterrupt/SystemExit are no longer swallowed.
             log.exception(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     log.info(self.log_msg('# of Tasks Added is %d' % len(self.linksOut)))
     if self.linksOut:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:31,代码来源:anandtechforumsconnector.py


示例2: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     rel='next' pagination links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or no next-page link can be resolved.
     The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__total_threads_count = 0
     self.__last_timestamp = datetime(1980, 1, 1)
     self.__setSoupForCurrentUri()
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='htcpedia_maxthreads'))

     while self.__getThreads():
         try:
             next_href = self.soup.find('a', rel='next')['href']
             self.currenturi = self.__removeSessionId(
                 'http://htcpedia.com/forum/' + next_href)
             self.__setSoupForCurrentUri()
         except Exception:
             # A missing next link (find() returning None) lands here and
             # ends the pagination.
             log.info(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     # NOTE(review): unlike sibling connectors this checks
     # self.__links_to_process rather than self.linksOut -- confirm intent.
     if self.__links_to_process:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     log.info(self.log_msg('# of tasks added is %d' % len(self.linksOut)))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:25,代码来源:htcpediaconnector.py


示例3: __createTasksForThreads

    def __createTasksForThreads(self):
        """
        Collect tasks for the threads listed at the current uri, following
        the 'Next' pagination links, and record the session info afterwards.

        self.__getThreads() is called once per listing page; the loop stops
        as soon as it returns a false value or no next-page link can be
        resolved.  The configured maximum thread count is stored in
        self.__max_threads_count (presumably enforced by __getThreads --
        TODO confirm).

        Returns:
            True, once pagination has finished.
        """
        self.__current_thread_count = 0
        self.__last_timestamp = datetime(1980, 1, 1)
        self.__max_threads_count = int(tg.config.get(
            path='Connector', key='ivillage_maxthreads'))
        while self.__getThreads():
            try:
                # The href is used as-is for the next page uri.
                self.currenturi = self.soup.find(
                    'a', href=True, text='Next').parent['href']
                self.__setSoupForCurrentUri()
            except Exception:
                # Any failure to resolve the next page ends the pagination;
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                log.exception(self.log_msg(
                    'Next Page link not found for url %s' % self.currenturi))
                break

        log.info('Total # of tasks found is %d' % len(self.linksOut))
        if self.linksOut:
            updateSessionInfo('Search', self.session_info_out,
                              self.__last_timestamp, None, 'ForumThreadsPage',
                              self.task.instance_data.get('update'))
        return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:25,代码来源:ivillageconnector.py


示例4: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     '>' pagination links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value, raises, or no next-page link can be
     resolved.  The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True when pagination finished, False if an unexpected exception
         was raised outside the per-page handling.
     """
     try:
         self.__total_threads_count = 0
         self.__last_timestamp = datetime(1980, 1, 1)
         # Upper bound on the number of threads to process, because not
         # all forums get updated every day.
         self.__max_threads_count = int(tg.config.get(
             path='Connector', key='talkandroid_maxthreads'))
         self.__setSoupForCurrentUri()
         while True:
             try:
                 # __getThreads() is deliberately inside the try: an error
                 # while reading a page also ends the pagination.
                 if not self.__getThreads():
                     break
                 self.currenturi = self.soup.find(
                     'a', text='>').parent['href']
                 self.__setSoupForCurrentUri()
             except Exception:
                 log.info(self.log_msg(
                     'Next Page link not found for url %s'
                     % self.currenturi))
                 break
         if self.linksOut:
             updateSessionInfo('Search', self.session_info_out,
                               self.__last_timestamp, None,
                               'ForumThreadsPage',
                               self.task.instance_data.get('update'))
         return True
     except Exception:
         log.exception(self.log_msg(
             'Exception while creating tasks for the url %s'
             % self.currenturi))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:32,代码来源:talkandroidconnector.py


示例5: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     numbered pagination links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or no next-page link can be resolved.
     The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__current_thread_count = 0
     self.__last_timestamp = datetime(1980, 1, 1)
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='mdjunction_maxthreads'))
     while self.__getThreads():
         try:
             # The current page marker is a <strong> whose text is '[N]'.
             current_page_tag = self.soup.find(
                 'strong', text=re.compile(r'^\[\d+\]$'))
             # Strip the surrounding brackets to get N, then look for the
             # link labelled N + 1 in the same table cell.
             next_page_label = str(int(current_page_tag[1:-1]) + 1)
             pager_cell = current_page_tag.findParent('td')
             self.currenturi = pager_cell.find(
                 'a', text=next_page_label).parent['href']
             self.__setSoupForCurrentUri()
         except Exception:
             # Any failure to resolve the next page ends the pagination;
             # KeyboardInterrupt/SystemExit are no longer swallowed.
             log.exception(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     log.info('Total # of tasks found is %d' % len(self.linksOut))
     if self.linksOut:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:26,代码来源:mdjunctionconnector.py


示例6: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     'Next page' image links, and record the session info afterwards.

     self.__processForumUrl() is called once per listing page; the loop
     stops as soon as it returns a false value or no next-page link can be
     resolved.  The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __processForumUrl --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__setSoupForCurrentUri()
     self.__total_threads_count = 0
     self.__baseuri = 'http://baliforum.com'
     self.__last_timestamp = datetime(1980, 1, 1)
     # Upper bound on the number of threads to process, because not all
     # forums get updated every day.
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='baliforum_maxthreads'))
     while self.__processForumUrl():
         try:
             next_image = self.soup.find('img', alt='Next page')
             self.currenturi = next_image.findParent('a')['href']
             self.__setSoupForCurrentUri()
         except Exception:
             # A missing image (find() returning None) lands here and ends
             # the pagination.
             log.info(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     log.debug(self.log_msg('LINKSOUT: ' + str(len(self.linksOut))))
     if self.linksOut:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:28,代码来源:baliforumconnector.py


示例7: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     'Next 20' form button, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or the next-page form cannot be
     resolved.  The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__current_thread_count = 0
     self.__last_timestamp = datetime(1980, 1, 1)
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='fatwallet_maxthreads'))
     while self.__getThreads():
         try:
             # The next page is reached through a form: rebuild it as a
             # uri from the form action plus its hidden input fields.
             next_tag = self.soup.find('input', value='Next 20')
             form_tag = next_tag.findParent('form')
             query_params = [
                 (hidden['name'], hidden['value'])
                 for hidden in form_tag.findAll('input', type='hidden')]
             self.currenturi = ('http://www.fatwallet.com' +
                                form_tag['action'] + '?' +
                                urlencode(query_params))
             self.__setSoupForCurrentUri()
         except Exception:
             # Any failure to resolve the next page ends the pagination;
             # KeyboardInterrupt/SystemExit are no longer swallowed.
             log.exception(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     log.info('Total # of tasks found is %d' % len(self.linksOut))
     if self.linksOut:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:30,代码来源:fatwalletconnector.py


示例8: fetch

 def fetch(self):
     """
     Dispatch on the task uri: a '.../threads' uri is treated as a thread
     listing whose result links are collected, a '.../thread/...' uri as a
     single thread whose question and answers are fetched.

     For a listing, self.__getPageData() is called until self.count reaches
     the 'microsoft_numresults' Connector config value or self.done is set.

     Returns:
         True when the uri was handled, False for an unrecognised uri or
         when an exception was raised.
     """
     try:
         uri = self.task.instance_data['uri']
         if re.match(r".*\/threads[\/]?$", uri):
             self.last_timestamp = datetime(1, 1, 1)
             # Forum name is the path segment just before '/threads'.
             self.forum_name = re.findall(
                 r'\/([^\/]+)\/threads\/?$', urlparse(uri)[2])[0]
             self.crawl_count = int(tg.config.get(
                 path='Connector', key='microsoft_numresults'))
             self.count = 0
             self.done = False
             self.currenturi = uri + '?sort=firstpostdesc'
             while self.count < self.crawl_count and not self.done:
                 self.__getPageData()
             log.debug(self.log_msg(
                 "Length of linksout is %d" % (len(self.linksOut))))
             if self.linksOut:
                 # NOTE(review): sibling connectors pass 'Search'
                 # (capitalised) here -- confirm 'search' is intended.
                 updateSessionInfo('search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
         elif re.match(r".*\/thread\/.*?$", uri):
             self.__getThread()
             self.__getQuestion()
             self.__getAnswers()
             return True
         else:
             # No exception is active here, so log.exception() would attach
             # a meaningless traceback; report a plain error instead.
             log.error(self.log_msg("Unassociated url %s" % (uri)))
             return False
     except Exception:
         log.exception(self.log_msg("Exception occured in fetch()"))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:29,代码来源:microsoftsocialconnector.py


示例9: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     'Next' pagination links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or no next-page link can be resolved.
     The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__total_threads_count = 0
     self.__baseuri = 'http://forums.seagate.com'
     self.__last_timestamp = datetime(1980, 1, 1)
     # Upper bound on the number of threads to process, because not all
     # forums get updated every day.
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='seagateforums_maxthreads'))
     self.__setSoupForCurrentUri()
     while self.__getThreads():
         try:
             next_href = self.soup.find(
                 'a', text='Next').findParent('a')['href']
             # Keep only the part of the href before the first ';'.
             self.currenturi = self.__baseuri + next_href.split(';')[0]
             self.__setSoupForCurrentUri()
         except Exception:
             # A missing link (find() returning None) lands here and ends
             # the pagination.
             log.info(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     if self.linksOut:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:28,代码来源:seagateforumsconnector.py


示例10: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     numbered 'navPages' links, and record the session info afterwards.

     self.__getThreads() is called once per listing page; the loop stops as
     soon as it returns a false value or no link for the next page number
     can be resolved.  The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __getThreads --
     TODO confirm).

     Returns:
         True, once pagination has finished.
     """
     self.__total_threads_count = 0
     self.__last_timestamp = datetime(1980, 1, 1)
     self.__setSoupForCurrentUri()
     self.__max_threads_count = int(tg.config.get(
         path='Connector', key='htchd2forum_maxthreads'))
     current_page_no = 1
     while self.__getThreads():
         try:
             current_page_no += 1
             next_links = []
             for nav_link in self.soup.findAll('a', 'navPages'):
                 label = stripHtml(nav_link.renderContents()).strip()
                 # Skip non-numeric pager links instead of letting int()
                 # raise and abort the whole pagination.
                 if label.isdigit() and int(label) == current_page_no:
                     next_links.append(nav_link)
             # An empty match list raises IndexError, which ends the loop.
             self.currenturi = self.__removeSessionId(
                 next_links[0]['href'])
             self.__setSoupForCurrentUri()
         except Exception:
             log.info(self.log_msg(
                 'Next Page link not found for url %s' % self.currenturi))
             break
     # NOTE(review): unlike sibling connectors this checks
     # self.__links_to_process rather than self.linksOut -- confirm intent.
     if self.__links_to_process:
         updateSessionInfo('Search', self.session_info_out,
                           self.__last_timestamp, None, 'ForumThreadsPage',
                           self.task.instance_data.get('update'))
     log.info(self.log_msg('# of tasks added is %d' % len(self.linksOut)))
     return True
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:26,代码来源:htchd2forumconnector.py


示例11: __createTasksForThreads

 def __createTasksForThreads(self):
     """
     Collect tasks for the threads listed at the current uri, following the
     'Next Page - ...' links, and record the session info afterwards.

     self.__processForumUrl() is called once per listing page; the loop
     stops as soon as it returns a false value or no next-page link can be
     resolved.  The configured maximum thread count is stored in
     self.__max_threads_count (presumably enforced by __processForumUrl --
     TODO confirm).

     Returns:
         True when pagination finished, False if an unexpected exception
         was raised outside the per-page handling.
     """
     try:
         self.__total_threads_count = 0
         self.__last_timestamp = datetime(1980, 1, 1)
         self.__max_threads_count = int(tg.config.get(
             path='Connector', key='iphoneforums_maxthreads'))
         self.__setSoupForCurrentUri()
         while self.__processForumUrl():
             try:
                 self.currenturi = self.soup.find(
                     'a', title=re.compile('Next Page - '))['href']
                 self.__setSoupForCurrentUri()
             except Exception:
                 log.exception(self.log_msg(
                     'Next Page link not found for url %s'
                     % self.currenturi))
                 break

         log.info(self.log_msg('LINKSOUT: ' + str(len(self.linksOut))))
         if self.linksOut:
             updateSessionInfo('Search', self.session_info_out,
                               self.__last_timestamp, None,
                               'ForumThreadsPage',
                               self.task.instance_data.get('update'))
         return True
     except Exception:
         # log.exception keeps the traceback, which log.info dropped.
         log.exception(self.log_msg(
             'Exception while creating tasks for the url %s'
             % self.currenturi))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:33,代码来源:iphoneforumsconnector.py


示例12: fetch

    def fetch(self):
        """
        Fetch an eggheadcafe page.

        A uri containing 'forumtree.aspx' is treated as a thread listing:
        self.__getThreadPage() is called per page and the session info is
        recorded when links were collected.  Any other uri is treated as a
        thread: the parent page is read and self.__addPosts() is called per
        page.  In both cases the 'Next' link is followed until it is
        missing or the soup cannot be set.

        Returns:
            True when the page was processed, False when the first soup
            could not be set or an exception was raised.
        """
        self.genre = "Review"
        try:
            self.base_url = 'http://www.eggheadcafe.com'
            self.parent_uri = self.currenturi
            self.total_posts_count = 0
            self.last_timestamp = datetime(1980, 1, 1)
            self.max_posts_count = int(tg.config.get(
                path='Connector', key='eggheadcafe_max_threads_to_process'))
            if not self.__setSoup():
                log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                return False

            if 'forumtree.aspx' not in self.currenturi:
                # Thread page: collect the posts of every page.
                self.__getParentPage()
                while True:
                    # The next link is read from a copy taken before
                    # __addPosts() runs (presumably because __addPosts
                    # changes self.soup -- TODO confirm).
                    parent_soup = copy.copy(self.soup)
                    self.__addPosts()
                    try:
                        self.currenturi = self.base_url + parent_soup.find(
                            'a', text='Next').parent['href']
                        if not self.__setSoup():
                            break
                    except Exception:
                        log.info(self.log_msg('Next Page link not found'))
                        break
                return True

            # Forum listing page: collect the thread links of every page.
            # NOTE(review): an earlier postback-based pagination
            # (__EVENTTARGET / __VIEWSTATE form data) was commented out
            # here in favour of following the 'Next' href.
            while True:
                try:
                    if not self.__getThreadPage():
                        break
                    self.currenturi = self.base_url + self.soup.find(
                        'a', text='Next').parent['href']
                    if not self.__setSoup():
                        break
                except Exception:
                    log.info(self.log_msg('Next Page link not found'))
                    break
            if self.linksOut:
                updateSessionInfo('Search', self.session_info_out,
                                  self.last_timestamp, None,
                                  'ForumThreadsPage',
                                  self.task.instance_data.get('update'))
            return True
        except Exception:
            log.exception(self.log_msg('Exception in fetch'))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:57,代码来源:eggheadcafeconnector.py


示例13: fetch

 def fetch(self):
     """
     Fetch a page of http://forums.devx.com.

     The current uri is first normalised with self.__getStandUri().  A
     'showthread' uri is treated as a thread (parent page plus posts of
     every page); a 'forumdisplay' uri is treated as a thread listing
     whose links are collected and whose session info is recorded.  In
     both cases the '&gt;' link is followed until it is missing or the
     soup cannot be set.

     Returns:
         True when the page was processed; False when the uri format is
         not recognised, the first soup could not be set, or an exception
         was raised.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         log.info(self.parent_uri)
         self.currenturi = self.__getStandUri(self.parent_uri)
         log.info(self.log_msg('The Standard Uri is'))
         # Log the standardised uri (the original logged parent_uri again).
         log.info(self.currenturi)
         if self.currenturi.startswith('http://forums.devx.com/showthread.'):
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type = True
             while True:
                 self.__addPosts()
                 try:
                     self.currenturi = self.__getStandUri(
                         'http://forums.devx.com/' +
                         self.soup.find('a', text='&gt;').parent['href'])
                 except Exception:
                     log.info(self.log_msg('Next page not set'))
                     break
                 if not self.__setSoup():
                     log.info(self.log_msg('cannot continue'))
                     break
             return True
         elif self.currenturi.startswith('http://forums.devx.com/forumdisplay'):
             self.total_posts_count = 0
             self.last_timestamp = datetime(1980, 1, 1)
             self.max_posts_count = int(tg.config.get(
                 path='Connector', key='devxforum_numresults'))
             self.currenturi = self.currenturi + '&daysprune=-1&order=desc&sort=lastpost'
             log.info(self.log_msg('The link is:'))
             log.info(self.currenturi)
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             while True:
                 if not self.__getThreads():
                     break
                 try:
                     self.currenturi = self.__getStandUri(
                         'http://forums.devx.com/' +
                         self.soup.find('a', text='&gt;').parent['href'])
                     if not self.__setSoup():
                         break
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo('Search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
         else:
             log.info(self.log_msg('Url format is not recognized, Please verify the url'))
             # Return an explicit failure instead of falling off the end
             # (which returned None).
             return False
     except Exception:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:57,代码来源:devxconnector.py


示例14: fetch

 def fetch(self):
     """
     Fetch a page of the mynextcollege forums.
     sample uri :  http://www.mynextcollege.com/college-reviews/discussion-room-f6.html

     Everything from '-sid=' onwards is stripped from the current uri.
     The forum root uri has its forum links added; a uri ending in
     '-f<digits>.html' is treated as a thread listing whose links are
     collected and whose session info is recorded; any other uri is treated
     as a thread (parent page plus posts of every page).  Pagination
     follows the 'Next' link until it is missing or the soup cannot be
     set.

     Returns:
         True when the page was processed, False when a soup could not be
         set or an exception was raised.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         self.currenturi = self.currenturi.split('-sid=')[0]
         if self.currenturi == 'http://www.mynextcollege.com/college-reviews/':
             try:
                 if not self.__setSoup():
                     return False
                 self.__addFortumLinks()
             except Exception:
                 log.info(self.log_msg('cannot add tasks'))
                 return False
             # NOTE(review): there is no return here, so the root uri falls
             # through to the thread branch below -- confirm this is
             # intended.
         if re.match(r'.*?\-f\d+\.html$', self.currenturi):
             self.total_posts_count = 0
             self.last_timestamp = datetime(1980, 1, 1)
             self.max_posts_count = int(tg.config.get(
                 path='Connector', key='mynextcollege_numresults'))
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             while True:
                 if not self.__getThreads():
                     break
                 try:
                     # Drop the href's first character and everything from
                     # '-sid=' onwards before prefixing the site root.
                     next_href = self.soup.find('a', text='Next').parent['href']
                     self.currenturi = ('http://www.mynextcollege.com/college-reviews' +
                                        next_href[1:].split('-sid=')[0])
                     if not self.__setSoup():
                         break
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo('Search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
         else:
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type = True
             while True:
                 self.__addPosts()
                 try:
                     next_href = self.soup.find('a', text='Next').parent['href']
                     self.currenturi = ('http://www.mynextcollege.com/college-reviews' +
                                        next_href[1:].split('-sid=')[0])
                     if not self.__setSoup():
                         break
                 except Exception:
                     log.info(self.log_msg('Next page not set'))
                     break
             return True
     except Exception:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:56,代码来源:mynextcollegeconnector.py


示例15: fetch

 def fetch(self):
     """
     Fetch a page of the ocenbank.pl forum.

     A 'viewforum' uri is treated as a thread listing whose links are
     collected and whose session info is recorded; a 'viewtopic' uri is
     treated as a thread (parent page plus posts of every page).
     Pagination looks up the link labelled with the next page number,
     starting at 2, inside the 'pagelink conl' paragraph.

     Returns:
         True when the page was processed; False for any other uri, when
         the first soup could not be set, or when an exception was raised.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         self.base_url = 'http://ocenbank.pl/forum/'
         if self.currenturi.startswith('http://ocenbank.pl/forum/viewforum'):
             self.total_posts_count = 0
             self.last_timestamp = datetime(1980, 1, 1)
             self.max_posts_count = int(tg.config.get(
                 path='Connector', key='ocean_forum_numresults'))
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             next_page_no = 2
             while True:
                 if not self.__getThreads():
                     break
                 try:
                     pager = self.soup.find('p', 'pagelink conl')
                     self.currenturi = self.base_url + pager.find(
                         'a', text=str(next_page_no)).parent['href']
                     if not self.__setSoup():
                         break
                     next_page_no += 1
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo('Search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
         elif self.currenturi.startswith('http://ocenbank.pl/forum/viewtopic'):
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type = True
             next_page_no = 2
             while True:
                 self.__addPosts()
                 try:
                     pager = self.soup.find('p', 'pagelink conl')
                     self.currenturi = self.base_url + pager.find(
                         'a', text=str(next_page_no)).parent['href']
                     if not self.__setSoup():
                         break
                     next_page_no += 1
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             return True
         else:
             log.info(self.log_msg('Wrong url is feeded'))
             log.info(self.log_msg('Hai+' + self.currenturi))
             return False
     except Exception:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:55,代码来源:oceanconnector.py


示例16: fetch

 def fetch(self):
     """
     Fetch a page of forums.msexchange.org, e.g.
     http://forums.msexchange.org/Message_Routing/forumid_18/tt.htm

     When the second-to-last path segment of the current uri starts with
     'forumid', the uri is treated as a thread listing: its links are
     collected and the session info is recorded.  Otherwise it is treated
     as a thread (parent page plus posts of every page).  Pagination
     follows the 'next &gt;' link until it is missing or the soup cannot
     be set.

     Returns:
         True when the page was processed, False when the first soup could
         not be set or an exception was raised.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         forum_id = self.currenturi.split('/')[-2]
         if forum_id.startswith('forumid'):
             self.total_posts_count = 0
             self.last_timestamp = datetime(1980, 1, 1)
             self.max_posts_count = int(tg.config.get(
                 path='Connector', key='msexchange_forum_numresults'))
             # Always start from the rebuilt first listing page.
             self.currenturi = 'http://forums.msexchange.org/%s/p_1/tmode_1/smode_1/tt.htm' % forum_id
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             while True:
                 if not self.__getThreads():
                     break
                 try:
                     self.currenturi = 'http://forums.msexchange.org' + self.soup.find(
                         'a', text='next &gt;').findParent('a')['href']
                     if not self.__setSoup():
                         break
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo('Search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
         else:
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type = True
             while True:
                 self.__addPosts()
                 try:
                     self.currenturi = 'http://forums.msexchange.org' + self.soup.find(
                         'a', text='next &gt;').findParent('a')['href']
                 except Exception:
                     log.info(self.log_msg('Next page not set'))
                     break
                 if not self.__setSoup():
                     log.info(self.log_msg('cannot continue'))
                     break
             return True
     except Exception:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:53,代码来源:msexchangeforumconnector.py


示例17: fetch

 def fetch(self):
     """
     Fetch a page of the sqlservercentral forums.

     A '.../Forums/Topic...' uri is treated as a thread: the parent page is
     read and self.__addPosts() is called per page, following the
     numbered links of the footer table starting at page 2.  Any other uri
     is treated as a thread listing: self.__getThreadPage() is called per
     page, following the 'Next Page' link, and the session info is
     recorded when links were collected.

     Returns:
         True when the page was processed, False when the first soup could
         not be set or an exception was raised.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         self.total_posts_count = 0
         self.last_timestamp = datetime(1980, 1, 1)
         self.max_posts_count = int(tg.config.get(
             path='Connector', key='sqlservercentral_numresults'))
         self.hrefs_info = self.currenturi.split('/')
         if self.currenturi.startswith('http://www.sqlservercentral.com/Forums/Topic'):
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type = True
             next_page_no = 2
             while True:
                 self.__addPosts()
                 try:
                     footer = self.soup.find(
                         'table', id=re.compile('FooterTable'))
                     self.currenturi = 'http://www.sqlservercentral.com/Forums/' + footer.find(
                         'a', text=str(next_page_no)).parent['href']
                     if not self.__setSoup():
                         break
                     next_page_no += 1
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             return True
         else:
             # Rewrite the listing uri before fetching (the extra path
             # segments presumably select a descending sort -- TODO
             # confirm).
             self.currenturi = self.currenturi.replace(
                 'Default.aspx', 'afcol/0/afsort/DESC/Default.aspx')
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             while True:
                 if not self.__getThreadPage():
                     break
                 try:
                     self.currenturi = 'http://www.sqlservercentral.com/Forums/' + self.soup.find(
                         'a', title='Next Page')['href']
                     if not self.__setSoup():
                         break
                 except Exception:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo('Search', self.session_info_out,
                                   self.last_timestamp, None,
                                   'ForumThreadsPage',
                                   self.task.instance_data.get('update'))
             return True
     except Exception:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:52,代码来源:sqlservercentralconnector.py


示例18: fetch

    def fetch(self):
        """
        Crawl the able2know.org page referenced by self.currenturi.

        Two URL families are handled:

        * 'http://able2know.org/topic/...' -- the parent page is processed
          once with __getParentPage, then __addPosts is run on every page
          for as long as a 'Next Page' link can be found and loaded.
        * 'http://able2know.org/tag/...' -- __getThreads is run page by page
          until it returns a false value or no further page can be loaded.
          If self.linksOut is non-empty afterwards, updateSessionInfo is
          called with self.last_timestamp.

        Returns True once a recognised URL has been processed. Returns False
        when the first page cannot be loaded, when the URL matches neither
        family, or when an unexpected exception is raised.
        """
        self.genre = "Review"
        try:
            self.parent_uri = self.currenturi
            if self.currenturi.startswith('http://able2know.org/topic/'):
                if not self.__setSoup():
                    log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                    return False
                self.__getParentPage()
                self.post_type = True
                while True:
                    self.__addPosts()
                    try:
                        # find() returns None on the last page, so the
                        # subscript raises and ends the pagination loop.
                        self.currenturi = self.soup.find('a', title='Next Page')['href']
                    except Exception:
                        log.info(self.log_msg('Next page not set'))
                        break
                    if not self.__setSoup():
                        log.info(self.log_msg('cannot continue'))
                        break
                return True
            elif self.currenturi.startswith('http://able2know.org/tag/'):
                self.total_posts_count = 0
                # Baseline timestamp; it is handed to updateSessionInfo below.
                self.last_timestamp = datetime(1980, 1, 1)
                self.max_posts_count = int(tg.config.get(path='Connector', key='know_forum_numresults'))
                if not self.__setSoup():
                    log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                    return False
                while True:
                    if not self.__getThreads():
                        break
                    try:
                        # NOTE(review): the next-page href is appended to the
                        # *current* uri, so the uri grows on every page --
                        # confirm this matches the site's relative links.
                        self.currenturi = self.currenturi + self.soup.find('a', accesskey='n')['href'].lstrip('.')
                        if not self.__setSoup():
                            break
                    except Exception:
                        log.info(self.log_msg('Next Page link not found'))
                        break
                if self.linksOut:
                    updateSessionInfo('Search', self.session_info_out, self.last_timestamp, None, 'ForumThreadsPage', self.task.instance_data.get('update'))
                return True
            else:
                log.info(self.log_msg('Url format is not recognized, Please verify the url'))
                # Explicit failure value, consistent with the other error
                # paths (previously this branch fell through and returned None).
                return False
        except Exception:
            # 'except Exception' rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            log.exception(self.log_msg('Exception in fetch'))
            return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:52,代码来源:able2knowconnector.py


示例19: fetch

 def fetch(self):
     """Crawl the www.petri.co.il forum page referenced by self.currenturi.

     A uri starting with 'http://www.petri.co.il/forums/showthread' is
     treated as a single thread: __getParentPage runs once and __addPosts
     runs on each page while a '&gt;' (next) link can be followed.

     Any other uri is treated as a thread listing: __getThreads runs on
     each page until it returns a false value or no next page can be
     loaded; updateSessionInfo is then called if self.linksOut is
     non-empty.

     Returns True after processing, or False if the first page cannot be
     loaded or an exception is raised along the way.
     """
     self.genre = "Review"
     try:
         self.parent_uri = self.currenturi
         is_thread_uri = self.currenturi.startswith('http://www.petri.co.il/forums/showthread')

         if not is_thread_uri:
             # ---- thread listing page ----
             self.total_posts_count = 0
             self.last_timestamp = datetime(1980, 1, 1)
             try:
                 configured_limit = tg.config.get(path='Connector', key='petri_max_threads_count')
                 self.max_posts_count = int(configured_limit)
             except:
                 # Fall back to a small default when the key is unusable.
                 log.info(self.log_msg('max therads count not set'))
                 self.max_posts_count = 5
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             while self.__getThreads():
                 try:
                     next_anchor = self.soup.find('a', text='&gt;').parent
                     self.currenturi = 'http://www.petri.co.il/forums/' + next_anchor['href']
                     if not self.__setSoup():
                         break
                 except:
                     log.info(self.log_msg('Next Page link not found'))
                     break
             if self.linksOut:
                 updateSessionInfo(
                     'Search',
                     self.session_info_out,
                     self.last_timestamp,
                     None,
                     'ForumThreadsPage',
                     self.task.instance_data.get('update'),
                 )
             return True

         # ---- single thread page ----
         if not self.__setSoup():
             log.info(self.log_msg('Soup not set , Returning False from Fetch'))
             return False
         self.__getParentPage()
         self.post_type = True
         page_loaded = True
         while page_loaded:
             self.__addPosts()
             try:
                 next_anchor = self.soup.find('a', text='&gt;').parent
                 self.currenturi = 'http://www.petri.co.il/forums/' + next_anchor['href']
             except:
                 log.info(self.log_msg('Next page not set'))
                 break
             page_loaded = self.__setSoup()
             if not page_loaded:
                 log.info(self.log_msg('cannot continue'))
         return True
     except:
         log.exception(self.log_msg('Exception in fetch'))
         return False
开发者ID:jsyadav,项目名称:CrawlerFramework,代码行数:51,代码来源:petriconnector.py


示例20: fetch

 def fetch(self):
     """
     Fetch of
     """
     self.genre="Review"
     try:
         self.parent_uri = self.currenturi
         rcheck = re.compile(r'\d+-',re.U)
         
         #if self.currenturi.startswith('http://talk.collegeconfidential.com/college-admissions/'):
         if rcheck.search(self.currenturi):
             if not self.__setSoup():
                 log.info(self.log_msg('Soup not set , Returning False from Fetch'))
                 return False
             self.__getParentPage()
             self.post_type= True
             while True:
                 self.__addPosts()
                 try:
                     self.currenturi = '' + self.soup.find('a',text='&gt;').parent['href']
                 except:
                     log.info(self.log_msg('Next page not set'))
                     break
                  

鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python shortcuts.error_page函数代码示例 | 发布时间:2022-05-26
下一篇:
Python sessioninfomanager.checkSessionInfo函数代码示例 | 发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap