Python sessioninfomanager.checkSessionInfo Function Code Examples


This article collects typical usage examples of the checkSessionInfo function from the Python module utils.sessioninfomanager. If you are unsure what checkSessionInfo does or how to call it, the curated examples below should help.



A total of 20 code examples of the checkSessionInfo function are shown below, ordered by popularity by default.
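
All of the examples follow the same check-then-update pattern: checkSessionInfo is called first with a genre, the session_info_out dictionary, a unique key (a URI, post anchor, or content hash) and the task's update flag; a truthy return means the item was already crawled and is skipped. Otherwise updateSessionInfo records the new content hash, and its result['updated'] flag decides whether the page is appended to self.pages. The sketch below condenses that call sequence; the helper function, its name, and its parameters are illustrative assumptions rather than code from CrawlerFramework.

    # Minimal sketch of the pattern shared by the examples below.
    # checkSessionInfo and updateSessionInfo come from utils.sessioninfomanager;
    # the helper name and its parameters are illustrative assumptions only.
    from utils.sessioninfomanager import checkSessionInfo, updateSessionInfo

    def add_if_new(genre, session_info_out, unique_key, post_hash, page, update, pages):
        # Skip the item if a previous crawl already recorded this unique_key.
        if checkSessionInfo(genre, session_info_out, unique_key, update):
            return False
        # Record the new content hash; result['updated'] is False when the
        # stored hash is unchanged since the last crawl.
        result = updateSessionInfo(genre, session_info_out, unique_key,
                                   post_hash, 'Post', update)
        if not result['updated']:
            return False
        pages.append(page)
        return True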

Example 1: __getParentPage

    def __getParentPage(self):
        """
        This will get the parent info
        """
        page = {}
        try:
            self.hierarchy =  page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div','CommonBreadCrumbArea').findAll('a')][1:]
            page['title']= page['et_thread_hierarchy'][-1]
        except:
            log.info(self.log_msg('Thread hierarchy is not found'))
            page['title']=''
        try:
            self.thread_id =  page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx',''))
        except:
            log.info(self.log_msg('Thread id not found'))
        if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,\
                                         self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return False

        for each in ['et_thread_last_post_author','ei_thread_replies_count','edate_last_post_date']:
            try:
                page[each] = self.task.pagedata[each]
            except:
                log.info(self.log_msg('page data cannot be extracted for %s'%each))
        try:
            post_hash = get_hash( page )
            id=None
            if self.session_info_out=={}:
                id=self.task.id
            result=updateSessionInfo( self.genre, self.session_info_out, self.\
                   parent_uri, post_hash,'Forum',self.task.instance_data.get('update'), Id=id)
            if not result['updated']:
                return False
            page['path']=[self.parent_uri]
            page['parent_path']=[]
            page['uri'] = normalize( self.currenturi )
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
            page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['data'] = ''
            page['task_log_id']=self.task.id
            page['entity'] = 'Post'
            page['category']=self.task.instance_data.get('category','')
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Parent Page added'))
            return True
        except :
            log.exception(self.log_msg("parent post couldn't be parsed"))
            return False
Author: jsyadav, Project: CrawlerFramework, Lines: 60, Source file: teamsystemrocksconnector.py


Example 2: __setParentPage

 def __setParentPage(self):
     """This will get the parent info
     """
     page = {}
     try:
         page['et_thread_hierarchy'] = self.__hierarchy = [x.strip() for x in stripHtml(self.soup.find('div', 'deck breadcrumbs').renderContents()).split('>') if x.strip()][1:]
         page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
     except:
         log.exception(self.log_msg('Thread hierarchy and Title Not found for uri\
                                                         %s'%self.currenturi))
         return
     if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'], \
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return
     try:
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [self.task.instance_data['uri']] 
             page['parent_path'] = []
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['data'] = ''
             page['entity'] = 'thread'
             page.update(self.__task_elements_dict)
             page['posted_date'] = page['pickup_date']
             self.pages.append(page)
             log.info(self.log_msg('Parent Page Added'))
         else:
             log.info(self.log_msg('Result[updated] returned False for uri %s' % self.currenturi))
     except:
         log.exception(self.log_msg("parent post couldn't be parsed"))
Author: jsyadav, Project: CrawlerFramework, Lines: 34, Source file: everydayhealthconnector.py


Example 3: __addPost

    def __addPost(self, post):
        '''It will add the post
        '''
        try:
            
            page = self.__getData(post)
            if not page:
                return True
            unique_key  = get_hash( {'data' : page['data'] })
            if checkSessionInfo('review', self.session_info_out, unique_key,\
                         self.task.instance_data.get('update'),parent_list\
                                            = [self.currenturi]):
                log.info(self.log_msg('Session info returns True'))
                return False

            result=updateSessionInfo('review', self.session_info_out, unique_key, \
                get_hash( page ),'Review', self.task.instance_data.get('update'),\
                                parent_list=[self.currenturi])
            if not result['updated']:
                log.info(self.log_msg('Update session info returns False'))
                return True
            page['path'] = [self.currenturi] 
            page['parent_path'] = []
            #page['path'].append(unique_key)
            page['uri'] = self.currenturi
            page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'post'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Post Added'))
            return True
        except:
            log.exception(self.log_msg('Error while adding session info'))
            return False  
Author: jsyadav, Project: CrawlerFramework, Lines: 35, Source file: bankguideconnector.py


Example 4: __getThreads

 def __getThreads(self):
         """
         It will fetch each thread and its associate infomarmation
         and add the tasks
         """
         threads = [x.findParent('tr') for x in self.soup.findAll('span', 'topictitle')]
         if not threads:
             log.info(self.log_msg('No threads are found for url %s'%\
                                                         self.currenturi))
             return False
         for thread in threads:
             self.__total_threads_count += 1
             if self.__total_threads_count > self.__max_threads_count:
                 log.info(self.log_msg('Reaching maximum post,Return false \
                                         from the url %s'%self.currenturi))
                 return False
             try:
                 date_str = stripHtml(thread.findAll('td')[-1].renderContents()).splitlines()[0].strip()
                 thread_time = datetime.strptime(date_str,'%a %b %d, %Y %I:%M %p')
             except:
                 log.exception(self.log_msg('Cannot fetch the date for the url\
                                                         %s'%self.currenturi))
                 continue
             if checkSessionInfo('Search', self.session_info_out, thread_time,\
                                     self.task.instance_data.get('update')):
                     log.info(self.log_msg('Session info Returns True for url %s'%self.currenturi))
                     return False
             self.__last_timestamp = max(thread_time , self.__last_timestamp )
             try:
                 self.__links_to_process.append(self.__removeSessionId('http://www.blackberryblast.com/forums/' + thread.find('a', 'topictitle')['href'] ))
             except:
                 log.exception(self.log_msg('Cannot find the thread url \
                                         in the uri %s'%self.currenturi))
                 continue
         return True
Author: jsyadav, Project: CrawlerFramework, Lines: 35, Source file: blackberryblastforumsconnector.py


Example 5: __addPosts

    def __addPosts(self, links, parent_list):
        """Given a list of links to the discussion post, fetch the post contents and the author info
        """
        h = HTTPConnection()
        for link in links:
            try:
                page = {}
                object_id = re.search('objectID=(\d+)', link).group(1)
                link = "http://communities.vmware.com/message/%s#%s" %(object_id, object_id)
                # Using the redirected url instead of the url given by the search page
                self.currenturi = link
                page['uri'] = normalize(link)
                log.debug(self.log_msg("Fetching the post url %s" %(self.currenturi)))
                if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                    self.task.instance_data.get('update'), parent_list=parent_list):
                    # No need to pick this page
                    continue
                res = self._getHTML()

                self.rawpage = res['result']
                self._setCurrentPage()
                # First try extracting from the post body
                if not self.__extractPostBody(page, object_id):
                    # if that fails, extract from the replies
                    self.__extractReplyBody(page, object_id)

            except:
                log.exception(self.log_msg("exception in extracting page"))
                continue
            page['posted_date'] = datetime.datetime.strftime(page['posted_date'], "%Y-%m-%dT%H:%M:%SZ")

            checksum = md5.md5(''.join(sorted(page.values())).encode('utf-8','ignore')).hexdigest()
            id = None
            if self.session_info_out=={}:
                id = self.task.id
            result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                       checksum, 'Post', self.task.instance_data.get('update'),
                                       parent_list=parent_list, Id=id)
            if result['updated']:
                page['path'] =  page['parent_path'] = parent_list
                page['path'].append(self.currenturi)
                page['priority']=self.task.priority
                page['level']=self.task.level
                page['pickup_date'] = datetime.datetime.strftime(datetime.datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                page['connector_instance_log_id'] = self.task.connector_instance_log_id
                page['connector_instance_id'] = self.task.connector_instance_id
                page['workspace_id'] = self.task.workspace_id
                page['client_id'] = self.task.client_id  # TODO: Get the client from the project 
                page['client_name'] = self.task.client_name
                page['last_updated_time'] = page['pickup_date']
                page['versioned'] = False
                page['entity'] = 'Review'
                page['category'] = self.task.instance_data.get('category','')
                page['task_log_id']=self.task.id
                page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                

            # Calculate the hash and get the session info thingy
            self.pages.append(page)
        return True
Author: jsyadav, Project: CrawlerFramework, Lines: 60, Source file: vmwareconnector.py


Example 6: __getSearchResults

    def __getSearchResults(self):
        '''It will fetch the search results and and add the tasks
        '''
        try:
            results = self.soup.findAll('dl','postprofile')
            for result in results:
                try:
                    if self.total_posts_count >= self.max_posts_count:
                        log.info(self.log_msg('Reaching maximum post,Return false'))
                        return False
                    self.total_posts_count = self.total_posts_count + 1
                    date_str =  stripHtml(result.find('dd').renderContents())
                    try:
                        thread_time = datetime.strptime(date_str,'%Y-%m-%d, %H:%M')
                    except:
                        log.info(self.log_msg('Cannot find the thread time, task not added '))
                        continue
                    if checkSessionInfo('search',self.session_info_out, thread_time,self.task.instance_data.get('update')) and self.max_posts_count >= self.total_posts_count:
                        log.info(self.log_msg('Session info return True'))
                        continue
                    self.last_timestamp = max(thread_time , self.last_timestamp )
                    temp_task=self.task.clone()
                    temp_task.instance_data[ 'uri' ] =  result.findAll('dd')[-3].find('a')['href']
                    log.info('taskAdded')
                    self.linksOut.append( temp_task )

                except:
                    log.exception(self.log_msg('task not added'))
            return True
        except:
            log.info(self.log_msg('cannot get the search results'))
Author: jsyadav, Project: CrawlerFramework, Lines: 31, Source file: bankoweforumconnector.py


Example 7: __getParentPage

 def __getParentPage(self,comment):
     """This will get the parent info
     """
     page = {}
     try:
         self.__total_replies_count = page['ei_data_replies_count'] = int(stripHtml(comment.find('totalreplies').renderContents()))
         page['title'] = page['data'] = stripHtml(comment.find('name').renderContents())
         page['posted_date'] = stripHtml(comment.find('dateadded').renderContents()).split('.')[0]
         unique_key = stripHtml(comment.find('messageid').renderContents())
         if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],\
                                      self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info return True, Already exists'))
             return
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path']=[unique_key] 
             page['parent_path']=[]
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['entity'] = 'post'
             page.update(self.__task_elements_dict)
             log.info(page['data'])
             self.pages.append(page)
         else:
             log.info(self.log_msg('Result[updated] returned False for uri %s' % self.currenturi))
     except:
         log.exception(self.log_msg('Hierachy/Title not found in url %s'%self.currenturi))
         return
Author: jsyadav, Project: CrawlerFramework, Lines: 30, Source file: ignconnector.py


Example 8: __addPost

 def __addPost(self, post, is_question=False):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:
         unique_key = post.find('a', attrs={'name':True})['name']
         permalink = self.currenturi + '#' + unique_key
         if checkSessionInfo(self.__genre, self.session_info_out, \
                     unique_key, self.task.instance_data.get('update'),\
                     parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'% \
                                                             permalink))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key ]
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 34, Source file: livestrongconnector.py


Example 9: __addPost

 def __addPost(self, post, is_question=False):
     try:
         unique_key = post.find('span', attrs={'class': 'name'}).\
                      find('a')['name']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                          self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         log.info(self.log_msg('page'))
         if not page:
             log.info(self.log_msg('page contains empty data __getData returns False \
                         for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, 
                 unique_key, get_hash( page ),'forum', self.task.\
                 instance_data.get('update'), parent_list = \
                 [ self.task.instance_data['uri'] ] )
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [ self.task.instance_data['uri'], unique_key]
             page['uri'] = self.currenturi 
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s' \
             % self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 33, Source file: pjnetconnector.py


Example 10: __fetchTopic

 def __fetchTopic(self):
     """
     """
     try:
         headers = {}
         headers['Accept-encoding'] = ''
         headers['Accept-Language'] = 'en-US,en;q=0.8'
         res=self._getHTML(self.currenturi,headers=headers)
         self.rawpage=res['result']
         self._setCurrentPage()
         try:
             post_hash= self.currenturi
         except:
             log.debug(self.log_msg("Error occured while creating the parent page hash"))
             return False
         if not checkSessionInfo(self.genre, self.session_info_out, 
                                 self.task.instance_data['uri'], self.task.instance_data.get('update')):
             id=None
             if self.session_info_out=={}:
                 id=self.task.id
                 log.debug('got the connector instance first time, sending updatesessioninfo the id : %s' % str(id))
                 result=updateSessionInfo(self.genre, self.session_info_out, self.task.instance_data['uri'], post_hash, 
                                          'Post', self.task.instance_data.get('update'), Id=id)
         return True
     except:
         log.exception(self.log_msg("Error occured while processing %s"%(self.currenturi)))
         return False
Author: jsyadav, Project: CrawlerFramework, Lines: 27, Source file: facebookconnector.py


Example 11: __processRSSFeeds

 def __processRSSFeeds(self):
     '''This will process the RSS Feeds of Facebook
     '''
     log.debug(self.log_msg("Entry Webpage: "+str(self.currenturi)))
     parser = feedparser.parse(self.currenturi)
     if not parser or len(parser.version) == 0:
         log.info(self.log_msg('parser version not found , returning'))
         return False
     log.info('number of entries %s'%(len(parser.entries)))
     for entity in parser.entries:
         try:
             if checkSessionInfo('Review',self.session_info_out, entity['link'],
                                     self.task.instance_data.get('update')):
                 log.info(self.log_msg('Session info returns  True for uri %s'%entity['link']))
                 continue
             result = updateSessionInfo('Review', self.session_info_out, entity['link'], '',
                                       'Post', self.task.instance_data.get('update'))
             if not result['updated']:
                 log.info(self.log_msg('Result not updated for uri %s'%entity['link']))
                 continue
             temp_task = self.task.clone()
             temp_task.instance_data['uri'] = normalize(entity['link'])
             temp_task.pagedata['title'] = entity['title']
             temp_task.pagedata['source'] = 'facebook.com'
             temp_task.instance_data['connector_name'] = 'HTMLConnector'
             temp_task.pagedata['source_type'] = 'rss'
             self.linksOut.append(temp_task)
         except:
             log.exception(self.log_msg("exception in adding temptask to linksout"))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 30, Source file: facebookconnector.py


Example 12: __addPost

 def __addPost(self, post, is_question=False):
     try:
         unique_key = post.find('a')['name'].replace('Post','')
         log.debug(self.log_msg('POST: ' + str(unique_key)))
         if checkSessionInfo('review', self.session_info_out, unique_key, \
                          self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'\
                                                             %unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out, 
             unique_key,get_hash( page ),'forum', self.task.instance_data.get\
                 ('update'),parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['path'] = [ self.task.instance_data['uri'], unique_key]
             page['parent_path'] = [self.task.instance_data['uri']]
             page['uri']= self.currenturi + '#' + unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             #page['entity'] = ''
             #log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'\
                                                         %self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 33, Source file: baliforumconnector.py


Example 13: __addPosts

 def __addPosts(self, post):
     '''It will add the post
     '''
     try:
         unique_key = post['id'].split('_')[-1]
         if checkSessionInfo('review', self.session_info_out, unique_key, \
                         self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'\
                                                            %self.currenturi))
             return False
         page = self.__getData(post)
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out, 
             unique_key,get_hash( page ),'review', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [ self.currenturi, unique_key]
             page['parent_path'] = []
             if not page.get('uri'):
                 page['uri']= self.currenturi + '#' + unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page['entity'] = 'review'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             log.info(self.log_msg('Update session info returns False for \
                                             url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'\
                                                         %self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 32, Source file: ciaoconnector.py


Example 14: __addPost

 def __addPost(self, post,is_question=False):
     '''This will add the post
     '''
     try:
         page = self.__getData(post,is_question)
         if not page:
             log.info(self.log_msg('No data found in url %s'%self.currenturi))        
             return True
         unique_key = get_hash({'data':page['data'], 'title':page['title']})
         if checkSessionInfo(self.__genre, self.session_info_out, \
                 unique_key, self.task.instance_data.get('update'),\
                 parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'% \
                                                         self.currenturi))
             return False            
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key ]
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post in url %s'%self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 30, Source file: ehealthforumconnector.py


Example 15: __addPost

    def __addPost(self, post, is_original_post=False):
        try:
            unique_key = stripHtml(str(post.findAll('div', 'oneLine')[2])).split()[2]

            page = self.__get_data(post, is_original_post, unique_key)
            if not page: 
                log.info(self.log_msg('page is empty, __get_data returns  False for uri %s' % 
                                      self.currenturi))
                return True

            if checkSessionInfo(self.__genre, self.session_info_out, 
                                unique_key, self.task.instance_data.get('update'), 
                                parent_list=[self.task.instance_data['uri']]):
                log.info(self.log_msg('Session info returns True for uri %s' % 
                                      self.task.instance_data['uri']))
                return False

            result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, 
                                       get_hash(page),'forum', self.task.instance_data.get('update'), 
                                       parent_list=[self.task.instance_data['uri']])
            if result['updated']:
                page['parent_path'] = [self.task.instance_data['uri']]
                page['path'] = [self.task.instance_data['uri'], unique_key]
                page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
                page.update(self.__task_elements_dict)
                self.pages.append(page)
            else:
                log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
        except:
            log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))

        return True
Author: jsyadav, Project: CrawlerFramework, Lines: 32, Source file: ivillageconnector.py


Example 16: __getThreads

 def __getThreads(self):
     """Get the threads on the current page"""
     try:
         threads = self.soup.find('table', attrs={'class': 'forumline'}).\
                   findAll('tr', recursive=False)[1:-1]
         if not threads:
             log.info(self.log_msg('No threads found for url %s'%\
                                                         self.currenturi))
             return False
     except:
         log.info(self.log_msg('exception while getting threads'))
         return False
     for thread in threads:
         if thread.find('b', text = 'Announcement:'):
             continue
         if thread.find('b', text = 'Sticky:'):
             continue
         if self.__thread_count >= self.__max_threads:
             log.info(self.log_msg('Reaching maximum post,Return false at \
                 the url %s' % self.currenturi))
             return False
         try:
             thread_time = self.__processTime(thread.findAll('span', attrs={'class': 'postdetails'})[-1].contents[0])
         except:
             log.exception(self.log_msg('date not found in %s' % self.currenturi))
         self.__thread_count += 1
         if checkSessionInfo('Search', self.session_info_out, thread_time, self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info Returns True for %s' % self.currenturi))
             return False
         self.__last_timestamp = max(thread_time, self.__last_timestamp)
         temp_task = self.task.clone()                    
         try:
             temp_task.instance_data[ 'uri' ] = self.__baseuri + thread.find('a', attrs={'class': 'topictitle'})['href']
         except:
             log.exception(self.log_msg('Cannot find the thread url \
                                     in the uri %s'%self.currenturi))
             continue
         try:
             temp_task.pagedata['et_thread_author'] = thread.find('span', attrs={'class': 'name'}).find('a').renderContents()
         except:
             log.info(self.log_msg('Exception raised when getting thread data from %s' % self.currenturi))
         try:
             lp_tag = thread.findAll('span', attrs={'class': 'postdetails'})[-1]
             temp_task.pagedata['edate_last_post_date'] = datetime.strftime(self.__processTime(lp_tag.contents[0]), "%Y-%m-%dT%H:%M:%SZ")
             temp_task.pagedata['et_thread_last_post_author'] = stripHtml(thread.find('a').renderContents())
         except:
             log.exception(self.log_msg('Exception raised when getting last\
                  post data from %s' % self.currenturi))
         try:
             temp_task.pagedata['ei_thread_replies_count'] = int(thread.findAll('td', recursive=False)[2].find('span').renderContents())
         except:
             log.info(self.log_msg('Replies count not found in the url %s' \
                 % self.currenturi))
         try:
             temp_task.pagedata['ei_thread_views_count'] = int(thread.findAll('td', recursive=False)[4].find('span').renderContents())
         except:
             log.info(self.log_msg('Views count not found in the url %s' % \
                 self.currenturi))
         self.linksOut.append(temp_task)
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 60, Source file: pjnetconnector.py


Example 17: __getSearchForumResults

 def __getSearchForumResults(self):
     '''It will fetch the search results and and add the tasks
     '''
     try:
         results = self.soup.findAll('div','eachResult')
         log.info(self.log_msg('Total Results found is %d'%len(results)))
         for result in results:
             try:
                 if self.total_posts_count >= self.max_posts_count:
                     log.info(self.log_msg('Reaching maximum post,Return false'))
                     return False
                 self.total_posts_count = self.total_posts_count + 1
                 date_str = stripHtml(result.find('span','grayText12').renderContents())
                 try:
                     thread_time = datetime.strptime(date_str, '%Y.%m.%d')
                 except:
                     log.info(self.log_msg('Cannot find the thread time, task not added '))
                     continue
                 if checkSessionInfo('search',self.session_info_out, thread_time,self.task.instance_data.get('update')) and self.max_posts_count >= self.total_posts_count:
                     log.info(self.log_msg('Session info return True or Reaches max count'))
                     return False
                 self.last_timestamp = max(thread_time , self.last_timestamp )
                 temp_task=self.task.clone()
                 temp_task.instance_data[ 'uri' ] = result.find('span','linkedBlueText13').find('a')['href']
                 log.info('taskAdded')
                 self.linksOut.append( temp_task )
             except:
                 log.exception(self.log_msg('task not added'))
                 continue
         return True
     except:
         log.exception(self.log_msg('cannot get the search results'))
         return False
Author: jsyadav, Project: CrawlerFramework, Lines: 33, Source file: chosunconnector.py


Example 18: __addPost

 def __addPost(self, post, is_question = False):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:  
         unique_tag = post.find('a', 'postcounter')
        #is_question = stripHtml(unique_tag.renderContents())== u'#1'
         unique_key = unique_tag['href']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,\
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = unique_key
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
Author: jsyadav, Project: CrawlerFramework, Lines: 33, Source file: everythingberryconnector.py


Example 19: __getParentPage

 def __getParentPage(self):
     ''
     if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,\
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return False
     page = {}
     try:
         page['et_thread_hierarchy'] = [each.replace('>','').strip() for each in stripHtml(self.soup.find('span','navbar').findParent('table').renderContents()).split('\n') if not each.strip()=='']
         page['title']= page['et_thread_hierarchy'][-1]
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
         page['title']=''
     for each in ['title','et_last_post_author_name','ei_thread_replies_count','ei_thread_views_count','edate_last_post_date','ei_thread_votes_count','ef_thread_rating']:
         try:
             page[each] = self.task.pagedata[each]
         except:
             log.info(self.log_msg('page data cannot be extracted'))
     try:
         page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
     except:
         log.info(self.log_msg('Thread id not found'))
         
     try:
         post_hash = get_hash( page )
         id=None
         if self.session_info_out=={}:
             id=self.task.id
         result=updateSessionInfo( self.genre, self.session_info_out, self.\
                currenturi, post_hash,'Post',self.task.instance_data.get('update'), Id=id)
         if not result['updated']:
             return False
         page['path']=[self.currenturi]
         page['parent_path']=[]
         page['uri'] = normalize( self.currenturi )
         page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
         page['priority']=self.task.priority
         page['level']=self.task.level
         page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         page['connector_instance_log_id'] = self.task.connector_instance_log_id
         page['connector_instance_id'] = self.task.connector_instance_id
         page['workspace_id'] = self.task.workspace_id
         page['client_id'] = self.task.client_id
         page['client_name'] = self.task.client_name
         page['last_updated_time'] = page['pickup_date']
         page['versioned'] = False
         #page['first_version_id']=result['first_version_id']
         page['data'] = ''
         #page['id'] = result['id']
         page['task_log_id']=self.task.id
         page['entity'] = 'Post'
         page['category']=self.task.instance_data.get('category','')
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Parent Page added'))
         return True
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
         return False
Author: jsyadav, Project: CrawlerFramework, Lines: 60, Source file: aspmessageboardconnector.py


Example 20: __addPost
