
Python urlnorm.normalize Function Code Examples


This article collects typical usage examples of the Python utils.urlnorm.normalize function. If you are wondering what the Python normalize function does, how to call it, or what real-world uses look like, the curated code examples below should help.



A total of 20 code examples of the normalize function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
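For orientation, here is a minimal, self-contained sketch of the kind of normalization these excerpts rely on, built only on the standard library urllib.parse. It is an illustrative stand-in, not the actual utils.urlnorm implementation; the real normalize (which several crawler examples call with the current URI and a base URL) may differ in detail, and the base parameter below is an assumption made for the sketch.

from urllib.parse import urljoin, urlsplit, urlunsplit

def normalize(url, base=None):
    """Illustrative stand-in for urlnorm.normalize (not the library's code)."""
    if base:
        # Resolve relative links against a base page, similar in spirit to the
        # crawler calls normalize(anchor['href'], self.currenturi, self.base).
        url = urljoin(base, url)
    scheme, netloc, path, query, fragment = urlsplit(url.strip())
    scheme = scheme.lower()
    netloc = netloc.lower()
    # Drop explicit default ports (assumed behaviour of a typical normalizer).
    if (scheme, netloc.rsplit(':', 1)[-1]) in (('http', '80'), ('https', '443')):
        netloc = netloc.rsplit(':', 1)[0]
    if not path:
        path = '/'
    return urlunsplit((scheme, netloc, path, query, fragment))

print(normalize('HTTP://Example.COM:80/feed?x=1'))
# http://example.com/feed?x=1
print(normalize('../reviews/item.html', base='http://www.example.com/blog/post/'))
# http://www.example.com/blog/reviews/item.html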

Example 1: _addLinksToCrawler

    def _addLinksToCrawler(self):
        """
        """
        try:
            log.info(self.log_msg('levels : %s , %s:%s:%s'%(self.currenturi,self.task.level,self.level,self.max_recursion_level)))
            if self.task.level > self.max_recursion_level and not self.task.instance_data.get('metapage'):
                log.debug('TaskID:%s::Client:%s::recursion level greater then MAX, returning for %s' % (self.task.id, self.task.client_name,self.currenturi))
                return

            #increment=1
            #if self.task.instance_data['metapage']:
                #increment=0

            for anchor in self.soup.findAll('a',href=True):
                try:
                    url = normalize(unicode(anchor['href']), self.currenturi, self.base)
                    #apply regex patters to urls :
                    if self.task.instance_data.get('url_filter'):
                        url_pattern = re.compile(self.task.instance_data['url_filter'],
                                                 re.IGNORECASE|re.DOTALL)
                        if not url_pattern.search(url):
                            continue
                    log.info(self.log_msg("clone uri :: %s"%normalize(unicode(anchor['href']), self.currenturi, self.base)))
                    temp_task=self.task.clone()
                    temp_task.instance_data['uri']=normalize(unicode(anchor['href']), self.currenturi, self.base)
                    #temp_task.level=int(self.task.level)+increment
                    temp_task.pagedata['title']=getTitleFromLink(anchor)
                    temp_task.priority=self.task.priority
                    self.linksOut.append(temp_task)
                except:
                    log.exception('TaskID:%s::Client:%s::failed to create one of the clone tasks' % (self.task.id, self.task.client_name))
                    continue
            return True #intentional indentation
        except:
            log.exception('TaskID:%s::Client:%s::addLinksToCrawler failed' % (self.task.id, self.task.client_name))
Developer: jsyadav, Project: CrawlerFramework, Lines: 35, Source: htmlconnector.py


Example 2: _process_item

 def _process_item(self, item):
   feed_title = item.xpath('./string[@name="title"]') and \
                         item.xpath('./string[@name="title"]')[0].text
   feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '', 1)
   feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                   item.xpath('./string[@name="htmlUrl"]')[0].text
   category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
                   item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
   
   if not feed_address:
       feed_address = feed_link
 
   try:
     feed_link = urlnorm.normalize(feed_link)
     feed_address = urlnorm.normalize(feed_address)
 
     feed = {
         'title': feed_title,
         'url': feed_address,
         'link': feed_link,
         'category': category,
         }
     return feed
 
   except Exception, e:
     print '---->Exception: %s: %s' % (e, item)
Developer: hpsoar, Project: reader4you, Lines: 27, Source: rss_importer.py


Example 3: process_outline

    def process_outline(self, outline):
        folders = []
        for item in outline:
            if not hasattr(item, 'xmlUrl') and hasattr(item, 'text'):
                folder = item
                # if hasattr(folder, 'text'):
                #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
                folders.append({folder.text: self.process_outline(folder)})
            elif hasattr(item, 'xmlUrl'):
                feed = item
                if not hasattr(feed, 'htmlUrl'):
                    setattr(feed, 'htmlUrl', None)
                # If feed title matches what's in the DB, don't override it on subscription.
                feed_title = getattr(feed, 'title', None) or getattr(feed, 'text', None)
                if not feed_title:
                    setattr(feed, 'title', feed.htmlUrl or feed.xmlUrl)
                    user_feed_title = None
                else:
                    setattr(feed, 'title', feed_title)
                    user_feed_title = feed.title

                feed_address = urlnorm.normalize(feed.xmlUrl)
                feed_link = urlnorm.normalize(feed.htmlUrl)
                if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                    continue
                if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                    continue
                # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
                feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
                # feeds.append(feed_data)

                # See if it exists as a duplicate first
                duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feed:
                    feed_db = duplicate_feed[0].feed
                else:
                    feed_data['active_subscribers'] = 1
                    feed_data['num_subscribers'] = 1
                    feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                            feed_link=feed_link,
                                                            defaults=dict(**feed_data))

                if user_feed_title == feed_db.feed_title:
                    user_feed_title = None

                us, _ = UserSubscription.objects.get_or_create(
                    feed=feed_db, 
                    user=self.user,
                    defaults={
                        'needs_unread_recalc': True,
                        'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                        'active': self.user.profile.is_premium,
                        'user_title': user_feed_title
                    }
                )
                if self.user.profile.is_premium and not us.active:
                    us.active = True
                    us.save()
                folders.append(feed_db.pk)
        return folders
Developer: JWegener, Project: NewsBlur, Lines: 60, Source: models.py


Example 4: get_or_create

  def get_or_create(cls, address, title='', link=''):
    address = urlnorm.normalize(address)
    link = link and urlnorm.normalize(link)

    feed = cls.get_by_url(address)
    if feed: return feed, True
    feed = Feed(address, title = title, link = link)
    feed.save()
    return feed.update(), False
Developer: hpsoar, Project: reader4you, Lines: 9, Source: feed.py


Example 5: process_item

    def process_item(self, item, folders):
        feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
        feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
            "feed/", ""
        )
        feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
        category = (
            item.xpath('./list[@name="categories"]/object/string[@name="label"]')
            and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
        )

        if not feed_address:
            feed_address = feed_link

        try:
            feed_link = urlnorm.normalize(feed_link)
            feed_address = urlnorm.normalize(feed_address)

            if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
                return folders

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data = dict(feed_title=feed_title)
                feed_data["active_subscribers"] = 1
                feed_data["num_subscribers"] = 1
                feed_db, _ = Feed.find_or_create(
                    feed_address=feed_address, feed_link=feed_link, defaults=dict(**feed_data)
                )

            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    "needs_unread_recalc": True,
                    "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    "active": self.user.profile.is_premium or self.auto_active,
                },
            )
            if not us.needs_unread_recalc:
                us.needs_unread_recalc = True
                us.save()
            if not category:
                category = ""

            if category:
                obj = {category: []}
                folders = add_object_to_folder(obj, "", folders)
            folders = add_object_to_folder(feed_db.pk, category, folders)
            # if feed_db.pk not in folders[category]:
            #     folders[category].append(feed_db.pk)
        except Exception, e:
            logging.info(" *** -> Exception: %s: %s" % (e, item))
Developer: bruceyou, Project: NewsBlur, Lines: 56, Source: models.py


Example 6: process_item

    def process_item(self, item, folders):
        feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
        feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
            "feed/", ""
        )
        feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
        category = (
            item.xpath('./list[@name="categories"]/object/string[@name="label"]')
            and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
        )

        if not feed_address:
            feed_address = feed_link

        try:
            feed_link = urlnorm.normalize(feed_link)
            feed_address = urlnorm.normalize(feed_address)

            if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
                return folders

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
                feed_data["active_subscribers"] = 1
                feed_data["num_subscribers"] = 1
                feeds = Feed.objects.filter(feed_address=feed_address, branch_from_feed__isnull=True).order_by(
                    "-num_subscribers"
                )
                if feeds:
                    feed_db = feeds[0]
                else:
                    feed_db = Feed.objects.create(**feed_data)

            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    "needs_unread_recalc": True,
                    "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    "active": self.user.profile.is_premium,
                },
            )
            if not category:
                category = "Root"
            folders[category].append(feed_db.pk)
        except Exception, e:
            logging.info(" *** -> Exception: %s" % e)
Developer: rcpsec, Project: NewsBlur, Lines: 51, Source: models.py


Example 7: process_outline

    def process_outline(self, outline):
        folders = []
        for item in outline:
            if not hasattr(item, "xmlUrl"):
                folder = item
                # if hasattr(folder, 'text'):
                #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
                folders.append({folder.text: self.process_outline(folder)})
            elif hasattr(item, "xmlUrl"):
                feed = item
                if not hasattr(feed, "htmlUrl"):
                    setattr(feed, "htmlUrl", None)
                if not hasattr(feed, "title") or not feed.title:
                    setattr(feed, "title", feed.htmlUrl or feed.xmlUrl)
                feed_address = urlnorm.normalize(feed.xmlUrl)
                feed_link = urlnorm.normalize(feed.htmlUrl)
                if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
                    continue
                if feed_link and len(feed_link) > Feed._meta.get_field("feed_link").max_length:
                    continue
                if len(feed.title) > Feed._meta.get_field("feed_title").max_length:
                    feed.title = feed.title[:255]
                # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
                feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
                # feeds.append(feed_data)

                # See if it exists as a duplicate first
                duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feed:
                    feed_db = duplicate_feed[0].feed
                else:
                    feed_data["active_subscribers"] = 1
                    feed_data["num_subscribers"] = 1
                    feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address, defaults=dict(**feed_data))

                us, _ = UserSubscription.objects.get_or_create(
                    feed=feed_db,
                    user=self.user,
                    defaults={
                        "needs_unread_recalc": True,
                        "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                        "active": self.user.profile.is_premium,
                    },
                )
                if self.user.profile.is_premium and not us.active:
                    us.active = True
                    us.save()
                folders.append(feed_db.pk)
        return folders
Developer: vvarp, Project: NewsBlur, Lines: 49, Source: models.py


Example 8: process_item

    def process_item(self, item, folders):
        feed_title = item.xpath('./string[@name="title"]') and \
                        item.xpath('./string[@name="title"]')[0].text
        feed_address = item.xpath('./string[@name="id"]') and \
                        item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
        feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                        item.xpath('./string[@name="htmlUrl"]')[0].text
        category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
                        item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
        
        if not feed_address:
            feed_address = feed_link
        
        try:
            feed_link = urlnorm.normalize(feed_link)
            feed_address = urlnorm.normalize(feed_address)

            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                return folders

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data = dict(feed_title=feed_title)
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.find_or_create(feed_address=feed_address, feed_link=feed_link,
                                                 defaults=dict(**feed_data))

            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db, 
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium or self.auto_active,
                }
            )
            if not us.needs_unread_recalc:
                us.needs_unread_recalc = True
                us.save()
            if not category: category = "Root"
            if feed_db.pk not in folders[category]:
                folders[category].append(feed_db.pk)
        except Exception, e:
            logging.info(' *** -> Exception: %s: %s' % (e, item))
Developer: 0077cc, Project: NewsBlur, Lines: 48, Source: models.py


Example 9: fetch

 def fetch(self):
     """
     same fetch method, I need to write something for doc string
     So I m writing this doc string
     """
     try:
         self.parenturi = self.currenturi
         self.genre = "Review"
         if self.currenturi == 'http://www.laptopical.com/laptop-reviews.html':
             if not self._setSoup():
                 return False
             hrefs = [ 'http://www.laptopical.com' + div.find('a')['href'] \
                     for div in self.soup.find('div',{'id':'review-listing'})\
                     .find('ul').findAll('li') if not div.find('a') == None ]
             for href in hrefs:
                 temp_task=self.task.clone()
                 temp_task.instance_data[ 'uri' ] = normalize( href )
                 self.linksOut.append( temp_task )
             log.info('Total uris are %d'%(len( hrefs )))
             return True
         if re.compile('http://www.laptopical.com/.+?\.html').match(self.currenturi):
             if not self._setSoup():
                 return False
             self._getParentPage()
             self._addReview()
             return True
     except:
         log.exception('error in fetch ')
         return False
Developer: jsyadav, Project: CrawlerFramework, Lines: 29, Source: laptopicalconnector.py


Example 10: __processRSSFeeds

 def __processRSSFeeds(self):
     '''This will process the RSS Feeds of Facebook
     '''
     log.debug(self.log_msg("Entry Webpage: "+str(self.currenturi)))
     parser = feedparser.parse(self.currenturi)
     if len(parser.version) == 0 or not parser:
         log.info(self.log_msg('parser version not found , returning'))
         return False
     log.info('number of entries %s'%(len(parser.entries)))
     for entity in parser.entries:
         try:
             if checkSessionInfo('Review',self.session_info_out, entity['link'],
                                     self.task.instance_data.get('update')):
                 log.info(self.log_msg('Session info returns  True for uri %s'%entity['link']))
                 continue
             result = updateSessionInfo('Review', self.session_info_out, entity['link'], '',
                                       'Post', self.task.instance_data.get('update'))
             if not result['updated']:
                 log.info(self.log_msg('Result not updated for uri %s'%entity['link']))
                 continue
             temp_task = self.task.clone()
             temp_task.instance_data['uri'] = normalize(entity['link'])
             temp_task.pagedata['title'] = entity['title']
             temp_task.pagedata['source'] = 'facebook.com'
             temp_task.instance_data['connector_name'] = 'HTMLConnector'
             temp_task.pagedata['source_type'] = 'rss'
             self.linksOut.append(temp_task)
         except:
             log.exception(self.log_msg("exception in adding temptask to linksout"))
     return True
Developer: jsyadav, Project: CrawlerFramework, Lines: 30, Source: facebookconnector.py


Example 11: fetch

    def fetch(self):

        self.genre="Review"
        try:
            self.__base_uri = 'http://answers.yahoo.com/'
            code = None
            parent_uri = self.currenturi
            res=self._getHTML()
            self.rawpage=res['result']
            self._setCurrentPage()
            self.POSTS_ITERATIONS = tg.config.get(path='Connector',key='yahooanswers_numposts')
            self.__max_date_submission_date = tg.config.get(path='Connector',key='yahooanswers_max_date_submission')
            self.curiter = 0
            if '/question/index' not in self.currenturi:
                self.__createSiteUrl()
                next_page = self.soup.find('li',{'class':'next'})
                while self.addQuestionUrls(parent_uri) and next_page:
                    try:
                        self.currenturi = normalize(self.__base_uri + next_page.a['href'])
                        log.debug(self.log_msg("Fetching url %s" %(self.currenturi)))
                        res=self._getHTML()
                        self.rawpage=res['result']
                        self._setCurrentPage()
                        next_page = self.soup.find('li',{'class':'next'})
                    except Exception, e:
                        log.exception(self.log_msg('exception in iterating pages in fetch'))
                        break
            else:
Developer: jsyadav, Project: CrawlerFramework, Lines: 28, Source: yahooanswersconnector.py


Example 12: __getParentPage

    def __getParentPage(self):
        """
        This will get the parent info
        """
        page = {}
        try:
            self.hierarchy =  page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div','CommonBreadCrumbArea').findAll('a')][1:]
            page['title']= page['et_thread_hierarchy'][-1]
        except:
            log.info(self.log_msg('Thread hierarchy is not found'))
            page['title']=''
        try:
            self.thread_id =  page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx',''))
        except:
            log.info(self.log_msg('Thread id not found'))
        if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,\
                                         self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return False

        for each in ['et_thread_last_post_author','ei_thread_replies_count','edate_last_post_date']:
            try:
                page[each] = self.task.pagedata[each]
            except:
                log.info(self.log_msg('page data cannot be extracted for %s'%each))
        try:
            post_hash = get_hash( page )
            id=None
            if self.session_info_out=={}:
                id=self.task.id
            result=updateSessionInfo( self.genre, self.session_info_out, self.\
                   parent_uri, post_hash,'Forum',self.task.instance_data.get('update'), Id=id)
            if not result['updated']:
                return False
            page['path']=[self.parent_uri]
            page['parent_path']=[]
            page['uri'] = normalize( self.currenturi )
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
            page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['data'] = ''
            page['task_log_id']=self.task.id
            page['entity'] = 'Post'
            page['category']=self.task.instance_data.get('category','')
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Parent Page added'))
            return True
        except :
            log.exception(self.log_msg("parent post couldn't be parsed"))
            return False
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: teamsystemrocksconnector.py


Example 13: api_save_new_subscription

def api_save_new_subscription(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    url = urlnorm.normalize(fields['url'])
    folder = fields['folder']
    
    if folder == "Top Level":
        folder = " "
    
    code, message, us = UserSubscription.add_subscription(
        user=user, 
        feed_address=url,
        folder=folder,
        bookmarklet=True
    )
    
    logging.user(request, "~FRAdding URL from ~FC~SBIFTTT~SN~FR: ~SB%s (in %s)" % (url, folder))

    if us and us.feed:
        url = us.feed.feed_address

    return {"data": [{
        "id": us and us.feed_id,
        "url": url,
    }]}
Developer: 76, Project: NewsBlur, Lines: 26, Source: views.py


Example 14: __getParentPage

 def __getParentPage(self):
     ''
     if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,\
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return False
     page = {}
     try:
         page['et_thread_hierarchy'] = [each.replace('>','').strip() for each in stripHtml(self.soup.find('span','navbar').findParent('table').renderContents()).split('\n') if not each.strip()=='']
         page['title']= page['et_thread_hierarchy'][-1]
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
         page['title']=''
     for each in ['title','et_last_post_author_name','ei_thread_replies_count','ei_thread_views_count','edate_last_post_date','ei_thread_votes_count','ef_thread_rating']:
         try:
             page[each] = self.task.pagedata[each]
         except:
             log.info(self.log_msg('page data cannot be extracted'))
     try:
         page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
     except:
         log.info(self.log_msg('Thread id not found'))
         
     try:
         post_hash = get_hash( page )
         id=None
         if self.session_info_out=={}:
             id=self.task.id
         result=updateSessionInfo( self.genre, self.session_info_out, self.\
                currenturi, post_hash,'Post',self.task.instance_data.get('update'), Id=id)
         if not result['updated']:
             return False
         page['path']=[self.currenturi]
         page['parent_path']=[]
         page['uri'] = normalize( self.currenturi )
         page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
         page['priority']=self.task.priority
         page['level']=self.task.level
         page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         page['connector_instance_log_id'] = self.task.connector_instance_log_id
         page['connector_instance_id'] = self.task.connector_instance_id
         page['workspace_id'] = self.task.workspace_id
         page['client_id'] = self.task.client_id
         page['client_name'] = self.task.client_name
         page['last_updated_time'] = page['pickup_date']
         page['versioned'] = False
         #page['first_version_id']=result['first_version_id']
         page['data'] = ''
         #page['id'] = result['id']
         page['task_log_id']=self.task.id
         page['entity'] = 'Post'
         page['category']=self.task.instance_data.get('category','')
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Parent Page added'))
         return True
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
         return False
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: aspmessageboardconnector.py


Example 15: api_share_new_story

def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    content = fields.get('story_content', "")
    story_title = fields.get('story_title', "[Untitled]")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)

    feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)
    
    content = lxml.html.fromstring(content)
    content.make_links_absolute(story_url)
    content = lxml.html.tostring(content)
    
    shared_story = MSharedStory.objects.filter(user_id=user.pk,
                                               story_feed_id=feed and feed.pk or 0,
                                               story_guid=story_url).limit(1).first()
    if not shared_story:
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title,
            "story_feed_id": feed and feed.pk or 0,
            "story_content": content,
            "story_author": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        shared_story = MSharedStory.objects.create(**story_db)
        socialsubs = MSocialSubscription.objects.filter(subscription_user_id=user.pk)
        for socialsub in socialsubs:
            socialsub.needs_unread_recalc = True
            socialsub.save()
        logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    
    try:
        socialsub = MSocialSubscription.objects.get(user_id=user.pk, 
                                                    subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None
    
    if socialsub:
        socialsub.mark_story_ids_as_read([shared_story.story_hash], 
                                          shared_story.story_feed_id, 
                                          request=request)
    else:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)

    shared_story.publish_update_to_subscribers()
    
    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}
Developer: magnet0, Project: NewsBlur, Lines: 60, Source: views.py


Example 16: get_by_url

 def get_by_url(cls, url):
   url = urlnorm.normalize(url)
   feeds = cls.filter({'feed_address': url})
   if feeds and len(feeds) > 0: return feeds[0]
   feeds = cls.filter({'feed_link': url})
   if feeds and len(feeds) > 0: return feeds[0]
   return None
Developer: hpsoar, Project: reader4you, Lines: 7, Source: feed.py


Example 17: fetch

 def fetch(self):
     """
     This will fetch the post of a tea review
     and add all info to the base class
     """
     try:                                                                                 
         self.genre ="Review"
         self.parent_uri = self.currenturi
         if not self._setSoup():
             log.info(self.log_msg('Task uri not set, cannot proceed') )
             return False
         if self.currenturi == 'http://www.teadiscussion.com/categories/index.php':
             for each in ['http://www.teadiscussion.com/categories/' + each['href'] for each in self.soup.find('p',text='Reviews of tea by types of tea:').parent.findNext('ul').findAll('a')]:
                 self.currenturi = each
                 if self._setSoup():
                     for href in [ahref['href'] for ahref in self.soup.findAll('a','categoryTitle')]:
                         temp_task=self.task.clone()
                         temp_task.instance_data[ 'uri' ] = normalize( href )
                         self.linksOut.append( temp_task )
             return True
         if not self.__getParentPage():
             log.info(self.log_msg('Parent page not posted '))
         self.__addReview()
         return True
     except:
         log.exception(self.log_msg('Error in Fetch'))
         return False                
Developer: jsyadav, Project: CrawlerFramework, Lines: 27, Source: teadiscussionconnector.py


Example 18: __addPosts

    def __addPosts(self, links, parent_list):
        """Given a list of links to the discussion post, fetch the post contents and the author info
        """
        h = HTTPConnection()
        for link in links:
            try:
                page = {}
                object_id = re.search('objectID=(\d+)', link).group(1)
                link = "http://communities.vmware.com/message/%s#%s" %(object_id, object_id)
                # Using the redirected url instead of the url given by the search page
                self.currenturi = link
                page['uri'] = normalize(link)
                log.debug(self.log_msg("Fetching the post url %s" %(self.currenturi)))
                if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                    self.task.instance_data.get('update'), parent_list=parent_list):
                    # No need to pick this page
                    continue
                res = self._getHTML()

                self.rawpage = res['result']
                self._setCurrentPage()
                # First try extracting from the post body
                if not self.__extractPostBody(page, object_id):
                    # if that fails, extract from the replies
                    self.__extractReplyBody(page, object_id)

            except:
                log.exception(self.log_msg("exception in extracting page"))
                continue
            page['posted_date'] = datetime.datetime.strftime(page['posted_date'], "%Y-%m-%dT%H:%M:%SZ")

            checksum = md5.md5(''.join(sorted(page.values())).encode('utf-8','ignore')).hexdigest()
            id = None
            if self.session_info_out=={}:
                id = self.task.id
            result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                       checksum, 'Post', self.task.instance_data.get('update'),
                                       parent_list=parent_list, Id=id)
            if result['updated']:
                page['path'] =  page['parent_path'] = parent_list
                page['path'].append(self.currenturi)
                page['priority']=self.task.priority
                page['level']=self.task.level
                page['pickup_date'] = datetime.datetime.strftime(datetime.datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                page['connector_instance_log_id'] = self.task.connector_instance_log_id
                page['connector_instance_id'] = self.task.connector_instance_id
                page['workspace_id'] = self.task.workspace_id
                page['client_id'] = self.task.client_id  # TODO: Get the client from the project 
                page['client_name'] = self.task.client_name
                page['last_updated_time'] = page['pickup_date']
                page['versioned'] = False
                page['entity'] = 'Review'
                page['category'] = self.task.instance_data.get('category','')
                page['task_log_id']=self.task.id
                page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                

            # Calculate the hash and get the session info thingy
            self.pages.append(page)
        return True
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: vmwareconnector.py


Example 19: get_feed_from_url

    def get_feed_from_url(cls, url):
        feed = None
    
        def by_url(address):
            feed = cls.objects.filter(feed_address=address)
            if not feed:
                duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=address).order_by('pk')
                if duplicate_feed:
                    feed = [duplicate_feed[0].feed]
                
            return feed
            
        url = urlnorm.normalize(url)
        feed = by_url(url)

        if feed:
            feed = feed[0]
        else:
            if feedfinder.isFeed(url):
                feed = cls.objects.create(feed_address=url)
                feed = feed.update()
            else:
                feed_finder_url = feedfinder.feed(url)
                if feed_finder_url:
                    feed = by_url(feed_finder_url)
                    if not feed:
                        feed = cls.objects.create(feed_address=feed_finder_url)
                        feed = feed.update()
                    else:
                        feed = feed[0]
                    
        return feed
Developer: rkabir, Project: NewsBlur, Lines: 32, Source: models.py


Example 20: process_outline

    def process_outline(self, outline):
        folders = []
    
        for item in outline:
            if not hasattr(item, 'xmlUrl'):
      
