• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python scraper_utils.cleanse_title函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中salts_lib.scraper_utils.cleanse_title函数的典型用法代码示例。如果您正苦于以下问题:Python cleanse_title函数的具体用法?Python cleanse_title怎么用?Python cleanse_title使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了cleanse_title函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: search

 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     results = []
     search_url = '/search/' + urllib.quote_plus(title)
     html = self._http_get(search_url, require_debrid=True, cache_limit=1)
     if video_type == VIDEO_TYPES.TVSHOW:
         seen_urls = {}
         for _attr, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')}):
             if CATEGORIES[video_type] not in post: continue
             match = re.search('<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
             if match:
                 show_url, match_title = match.groups()
                 if show_url in seen_urls: continue
                 result = {'url': scraper_utils.pathify_url(show_url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                 seen_urls[show_url] = result
                 results.append(result)
     elif video_type == VIDEO_TYPES.MOVIE:
         norm_title = scraper_utils.normalize_title(title)
         headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
         posts = [result.content for result in dom_parser2.parse_dom(html, 'div', {'id': re.compile('post-\d+')})]
         for heading, post in zip(headings, posts):
             if CATEGORIES[video_type] not in post or self.__too_old(post): continue
             post_url, post_title = heading
             meta = scraper_utils.parse_movie_link(post_title)
             full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'], meta['height'])
             match_year = meta['year']
             
             match_norm_title = scraper_utils.normalize_title(meta['title'])
             if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                 result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                 results.append(result)
         
     return results
开发者ID:CYBERxNUKE,项目名称:xbmc-addon,代码行数:32,代码来源:2ddl_scraper.py


示例2: search

    def search(self, video_type, title, year, season=''):
        search_url = urlparse.urljoin(self.base_url, '/index.php?search=%s&image.x=0&image.y=0')
        search_url = search_url % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=.25)

        results = []
        # Are we on a results page?
        if not re.search('window\.location', html):
            pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
            for match in re.finditer(pattern, html, re.DOTALL):
                match_title_year, match_url = match.groups('')
                # skip porn
                if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year: continue
                
                match_title_year = re.sub('</?.*?>', '', match_title_year)
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                
                if not year or not match_year or year == match_year:
                    result = {'url': match_url, 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
        else:
            match = re.search('window\.location\s+=\s+"([^"]+)', html)
            if match:
                url = match.group(1)
                if url != 'movies.php':
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': year}
                    results.append(result)
        return results
开发者ID:AMOboxTV,项目名称:AMOBox.LegoBuild,代码行数:33,代码来源:filmikz_scraper.py


示例3: search

    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/index.php')
        params = {'search': title, 'image.x': 0, 'image.y': 0}
        html = self._http_get(search_url, params=params, cache_limit=1)

        # Are we on a results page?
        if not re.search('window\.location', html):
            pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
            for match in re.finditer(pattern, html, re.DOTALL):
                match_title_year, match_url = match.groups('')
                # skip porn
                if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year: continue
                match_title_year = re.sub('</?.*?>', '', match_title_year)
                match_title, match_year = scraper_utils.extra_year(match_title_year)
                if not year or not match_year or year == match_year:
                    result = {'url': match_url, 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
        else:
            match = re.search('window\.location\s+=\s+"([^"]+)', html)
            if match:
                url = match.group(1)
                if url != 'movies.php':
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': year}
                    results.append(result)
        return results
开发者ID:EPiC-APOC,项目名称:repository.xvbmc,代码行数:26,代码来源:filmikz_scraper.py


示例4: __get_ok

 def __get_ok(self, embed, flashvars):
     hosters = []
     link = flashvars[0].attrs['value']
     match = re.search('metadataUrl=([^"]+)', link)
     if match:
         referer = scraper_utils.cleanse_title(urllib.unquote(embed[0].attrs['data']))
         ok_url = scraper_utils.cleanse_title(urllib.unquote(match.group(1)))
         html = self._http_get(ok_url, data='ok', headers={'Referer': referer}, cache_limit=.25)
         js_data = scraper_utils.parse_json(html, ok_url)
         stream_url = js_data.get('movie', {}).get('url')
         if stream_url is not None:
             host = urlparse.urlparse(stream_url).hostname
             hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': QUALITIES.HD720, 'views': None, 'rating': None, 'url': stream_url, 'direct': False, 'subs': 'Turkish Subtitles'}
             hosters.append(hoster)
     return hosters
开发者ID:CYBERxNUKE,项目名称:xbmc-addon,代码行数:15,代码来源:dizibox_scraper.py


示例5: search

    def search(self, video_type, title, year, season=''):
        search_url = urlparse.urljoin(self.base_url, '/search/%s' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=.25)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'name_top'}):
            match = re.search('href="([^"]+)[^>]+>([^<]+)', item)
            if match:
                url, match_title_year = match.groups()
                is_season = re.search('Season\s+(\d+)', match_title_year, re.IGNORECASE)
                if not is_season and video_type == VIDEO_TYPES.MOVIE or is_season and VIDEO_TYPES.SEASON:
                    match_year = ''
                    if video_type == VIDEO_TYPES.MOVIE:
                        match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
                        if match:
                            match_title, match_year = match.groups()
                        else:
                            match_title = match_title_year
                    else:
                        match_title = match_title_year
                        if season and int(is_season.group(1)) != int(season):
                            continue
                    
                    if not year or not match_year or year == match_year:
                        result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                        results.append(result)

        return results
开发者ID:c0ns0le,项目名称:YCBuilds,代码行数:27,代码来源:watchhd_scraper.py


示例6: search

    def search(self, video_type, title, year, season=''):
        results = []
        if video_type == VIDEO_TYPES.TVSHOW:
            url = urlparse.urljoin(self.base_url, '/series/all/')
            html = self._http_get(url, cache_limit=8)
    
            links = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')
            titles = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
            items = zip(links, titles)
        else:
            url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
            data = {'q': title, 'go': 'Search'}
            html = self._http_get(url, data=data, cache_limit=8)
            match = re.search('you can search again in (\d+) seconds', html, re.I)
            if match:
                wait = int(match.group(1))
                if wait > self.timeout: wait = self.timeout
                time.sleep(wait)
                html = self._http_get(url, data=data, cache_limit=0)
                
            pattern = 'class="movie_box.*?href="([^"]+).*?<h1>([^<]+)'
            items = re.findall(pattern, html, re.DOTALL)

        norm_title = scraper_utils.normalize_title(title)
        for item in items:
            url, match_title = item
            if norm_title in scraper_utils.normalize_title(match_title):
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                results.append(result)

        return results
开发者ID:freeworldxbmc,项目名称:KAOSbox-Repo,代码行数:31,代码来源:moviestorm_scraper.py


示例7: search

    def search(self, video_type, title, year, season=''):
        search_url = urlparse.urljoin(self.base_url, '/movies.php?list=search&search=')
        search_url += urllib.quote_plus(title)
        cookies = {'onlylanguage': 'en', 'lang': 'en'}
        html = self._http_get(search_url, cookies=cookies, cache_limit=.25)
        results = []
        pattern = 'id="tdmovies">\s*<a\s+href="([^"]+)">([^<]+).*?id="f7">(.*?)</TD>'
        for match in re.finditer(pattern, html, re.DOTALL):
            url, title, extra = match.groups('')
            if (video_type == VIDEO_TYPES.MOVIE and '(TVshow)' in title) or (video_type == VIDEO_TYPES.TVSHOW and '(TVshow)' not in title):
                continue

            title = title.replace('(TVshow)', '')
            title = title.strip()

            r = re.search('>(\d{4})<', extra)
            if r:
                match_year = r.group(1)
            else:
                match_year = ''

            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': match_year}
                results.append(result)
        return results
开发者ID:ScriptUp,项目名称:salts,代码行数:25,代码来源:movie4k_scraper.py


示例8: search

    def search(self, video_type, title, year, season=''):
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=1)
        fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'cfv'})
        if not fragment: return results
        
        norm_title = scraper_utils.normalize_title(title)
        for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'li'):
            is_season = dom_parser2.parse_dom(item, 'div', {'class': 'status'})
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match = dom_parser2.parse_dom(item, 'a', req=['href', 'title'])
                if not match: continue
                
                match_title = match[0].attrs['title']
                match_url = match[0].attrs['href']
                match_year = ''
                if video_type == VIDEO_TYPES.SEASON:
                    if season and not re.search('Season\s+%s$' % (season), match_title, re.I):
                        continue
                else:
                    match = re.search('-(\d{4})[-.]', match_url)
                    if match:
                        match_year = match.group(1)
                
                match_norm_title = scraper_utils.normalize_title(match_title)
                title_match = (norm_title in match_norm_title) or (match_norm_title in norm_title)
                if title_match and (not year or not match_year or year == match_year):
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                    results.append(result)

        return results
开发者ID:CYBERxNUKE,项目名称:xbmc-addon,代码行数:32,代码来源:moviesub_scraper.py


示例9: search

    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search/')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        for fragment in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
            name = dom_parser.parse_dom(fragment, 'div', {'class': 'name'})
            if name:
                match = re.search('href="([^"]+)[^>]+>(.*?)</a>', name[0])
                if match:
                    match_url, match_title_year = match.groups()
                    if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE: continue
                    
                    match_title_year = re.sub('</?[^>]*>', '', match_title_year)
                    match_title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', match_title_year)
                    match_title_year = match_title_year.replace('&#8217;', "'")
                    match = re.search('(.*?)\s+\((\d{4})[^)]*\)$', match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                        match_year = ''
    
                    if not match_year:
                        year_span = dom_parser.parse_dom(fragment, 'span', {'class': 'year'})
                        if year_span:
                            year_text = dom_parser.parse_dom(year_span[0], 'a')
                            if year_text:
                                match_year = year_text[0].strip()
    
                    if not year or not match_year or year == match_year:
                        result = {'title': scraper_utils.cleanse_title(match_title), 'url': scraper_utils.pathify_url(match_url), 'year': match_year}
                        results.append(result)

        return results
开发者ID:AMOboxTV,项目名称:AMOBox.LegoBuild,代码行数:35,代码来源:moviexk_scraper.py


示例10: __alt_search

    def __alt_search(self, video_type, title, year, season=''):
        results = []
        params = title.lower()
        if year: params += ' %s' % (year)
        if video_type == VIDEO_TYPES.SEASON and season:
            params += ' Season %s' % (season)
        params = {'key': params}
        search_url = urlparse.urljoin(self.base_url, '/search')
        html = self._http_get(search_url, params=params, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'caption'}):
            match = re.search('href="([^"]+)[^>]+>(.*?)<span[^>]*>', item)
            if match:
                match_url, match_title = match.groups()
                is_season = re.search('-season-\d+', match_url)
                if (video_type == VIDEO_TYPES.MOVIE and not is_season) or (video_type == VIDEO_TYPES.SEASON and is_season):
                    if video_type == VIDEO_TYPES.SEASON:
                        if season and not re.search('season-0*%s$' % (season), match_url): continue
                        
                    match_title = re.sub('</?[^>]*>', '', match_title)
                    match_title = re.sub('\s+Full\s+Movie', '', match_title)
                    match = re.search('-(\d{4})(?:$|-)', match_url)
                    if match:
                        match_year = match.group(1)
                    else:
                        match_year = ''
                    
                    if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                        result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                        results.append(result)

        return results
开发者ID:EPiC-APOC,项目名称:repository.xvbmc,代码行数:32,代码来源:hdmovie14_scraper.py


示例11: search

    def search(self, video_type, title, year, season=''):
        search_url = self.base_url
        if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
            search_url += '/tvshow'

        search_url += '/advanced-search.php?search='
        search_url += urllib.quote_plus(title)
        search_url += '&year=' + urllib.quote_plus(str(year))
        search_url += '&advanced_search=Search'

        html = self._http_get(search_url, cache_limit=.25)
        results = []
        for element in dom_parser.parse_dom(html, 'div', {'class': 'list_box_title'}):
            match = re.search('href="([^"]+)"\s+title="(?:Watch )?([^"]+)', element)
            if match:
                url, match_title_year = match.groups()
                match = re.search('(.*?)(?:\s+\(?\s*(\d{4})\s*\)?)', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
        return results
开发者ID:AMOboxTV,项目名称:AMOBox.LegoBuild,代码行数:27,代码来源:merdb_scraper.py


示例12: search

    def search(self, video_type, title, year, season=''):  # @UnusedVariable
        results = []
        search_url = scraper_utils.urljoin(self.base_url, '/movies.php')
        cookies = {'onlylanguage': 'en', 'lang': 'en'}
        params = {'list': 'search', 'search': title}
        html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=8)
        for _attrs, content in dom_parser2.parse_dom(html, 'TR', {'id': re.compile('coverPreview\d+')}):
            match = dom_parser2.parse_dom(content, 'a', req='href')
            if not match: continue
            
            match_url, match_title = match[0].attrs['href'], match[0].content
            is_show = re.search('\(tvshow\)', match_title, re.I)
            if (video_type == VIDEO_TYPES.MOVIE and is_show) or (video_type == VIDEO_TYPES.TVSHOW and not is_show):
                continue

            match_title = match_title.replace('(TVshow)', '')
            match_title = match_title.strip()
            
            match_year = ''
            for _attrs, div in dom_parser2.parse_dom(content, 'div'):
                match = re.match('\s*(\d{4})\s*', div)
                if match:
                    match_year = match.group(1)

            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
        return results
开发者ID:CYBERxNUKE,项目名称:xbmc-addon,代码行数:28,代码来源:movie4k_scraper.py


示例13: search

 def search(self, video_type, title, year, season=''):
     search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
     search_url = search_url % (urllib.quote_plus(title))
     html = self._http_get(search_url, cache_limit=0)
     results = []
     for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
         match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
         url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
         if match_title and url:
             match_title, url = match_title[0], url[0]
             is_season = re.search('Season\s+(\d+)$', match_title, re.I)
             if not is_season and video_type == VIDEO_TYPES.MOVIE or is_season and VIDEO_TYPES.SEASON:
                 match_year = ''
                 if video_type == VIDEO_TYPES.MOVIE:
                     match_year = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
                     if match_year:
                         match_year = match_year[0]
                 else:
                     if season and int(is_season.group(1)) != int(season):
                         continue
                 
                 if not year or not match_year or year == match_year:
                     result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                     results.append(result)
     return results
开发者ID:ScriptUp,项目名称:salts,代码行数:25,代码来源:tunemovie_scraper.py


示例14: search

    def search(self, video_type, title, year, season=''):
        results = []
        html = self._http_get(self.base_url, params={'s': title}, cache_limit=8)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
            match = re.search('href="([^"]+)', item)
            match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if match and match_title:
                url = match.group(1)
                match_title = match_title[0]
                if re.search('\d+\s*x\s*\d+', match_title): continue  # exclude episodes
                match_title, match_year = scraper_utils.extra_year(match_title)
                if not match_year and year_frag:
                    match_year = year_frag[0]

                match = re.search('(.*?)\s+\d{3,}p', match_title)
                if match:
                    match_title = match.group(1)
                
                extra = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
                if extra:
                    match_title += ' [%s]' % (extra[0])
                    
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                    results.append(result)

        return results
开发者ID:EPiC-APOC,项目名称:repository.xvbmc,代码行数:28,代码来源:hevcbluray_scraper.py


示例15: search

    def search(self, video_type, title, year, season=''):
        search_url = urlparse.urljoin(self.base_url, '/movie/search/')
        title = re.sub('[^A-Za-z0-9 ]', '', title)
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        results = []
        for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
            match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
            match_url = re.search('href="([^"]+)', item, re.DOTALL)
            match_year = re.search('class="jt-info">(\d{4})<', item)
            is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
            
            if (video_type == VIDEO_TYPES.MOVIE and not is_episodes) or (video_type == VIDEO_TYPES.SEASON and is_episodes):
                if match_title and match_url:
                    match_title = match_title[0]
                    match_title = re.sub('</?h2>', '', match_title)
                    match_title = re.sub('\s+\d{4}$', '', match_title)
                    if video_type == VIDEO_TYPES.SEASON:
                        if season and not re.search('Season\s+%s$' % (season), match_title): continue
                        
                    url = urlparse.urljoin(match_url.group(1), 'watching.html')
                    match_year = match_year.group(1) if match_year else ''
    
                    if not year or not match_year or year == match_year:
                        result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                        results.append(result)

        return results
开发者ID:freeworldxbmc,项目名称:KAOSbox-Repo,代码行数:28,代码来源:123movies_scraper.py


示例16: search

    def search(self, video_type, title, year, season=''):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
        html = self._http_get(search_url, cache_limit=1)
        fragment = dom_parser.parse_dom(html, 'ul', {'class': 'cfv'})
        if fragment:
            for item in dom_parser.parse_dom(fragment[0], 'li'):
                is_season = dom_parser.parse_dom(item, 'div', {'class': 'status'})
                if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                    match_url = dom_parser.parse_dom(item, 'a', ret='href')
                    match_title = dom_parser.parse_dom(item, 'a', ret='title')
                    if match_url and match_title:
                        match_title = match_title[0]
                        match_url = match_url[0]
                        match_year = ''
                        if video_type == VIDEO_TYPES.SEASON:
                            if season and not re.search('Season\s+%s$' % (season), match_title, re.I):
                                continue
                        else:
                            match = re.search('-(\d{4})\.html', match_url)
                            if match:
                                match_year = match.group(1)
                        
                        if not year or not match_year or year == match_year:
                            result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                            results.append(result)

        return results
开发者ID:EPiC-APOC,项目名称:repository.xvbmc,代码行数:28,代码来源:moviesub_scraper.py


示例17: search

    def search(self, video_type, title, year, season=""):
        results = []
        search_url = urlparse.urljoin(self.base_url, "/search?query=%s")
        search_url = search_url % (urllib.quote_plus(title))
        html = self._http_get(search_url, cache_limit=8)
        for item in dom_parser.parse_dom(html, "div", {"class": "one_movie-item"}):
            match_url = dom_parser.parse_dom(item, "a", ret="href")
            match_title = dom_parser.parse_dom(item, "img", ret="alt")
            media_type = dom_parser.parse_dom(item, "div", {"class": "movie-series"})
            if not media_type:
                media_type = VIDEO_TYPES.MOVIE
            elif media_type[0] == "TV SERIE":
                media_type = VIDEO_TYPES.TVSHOW

            if match_url and match_title and video_type == media_type:
                match_url = match_url[0]
                match_title = match_title[0]

                match_year = re.search("-(\d{4})-", match_url)
                if match_year:
                    match_year = match_year.group(1)
                else:
                    match_year = ""

                if not year or not match_year or year == match_year:
                    result = {
                        "url": scraper_utils.pathify_url(match_url),
                        "title": scraper_utils.cleanse_title(match_title),
                        "year": match_year,
                    }
                    results.append(result)

        return results
开发者ID:EPiC-APOC,项目名称:repository.xvbmc,代码行数:33,代码来源:moviewatcher_scraper.py


示例18: search

 def search(self, video_type, title, year, season=''):  # @UnusedVariable
     results = []
     search_url = base64.decodestring(SEARCH_URL) % (urllib.quote_plus(title))
     html = self._http_get(search_url, cache_limit=2)
     if html:
         js_data = scraper_utils.parse_json(html)
         search_meta = scraper_utils.parse_episode_link(title)
         for item in js_data.get('results', []):
             metatags = item.get('richSnippet', {}).get('metatags', {})
             post_date = metatags.get('articlePublishedTime')
             if post_date:
                 post_date = re.sub('[+-]\d+:\d+$', '', post_date)
                 post_date = scraper_utils.to_datetime(post_date, '%Y-%m-%dT%H:%M:%S').date()
                 if self.__too_old(post_date): continue
             
             match_title = metatags.get('ogTitle', '')
             if not match_title:
                 match_title = item['titleNoFormatting']
                 match_title = re.sub(re.compile('\s*-\s*Scene\s*Down$', re.I), '', match_title)
             match_url = item['url']
             match_year = ''
             item_meta = scraper_utils.parse_episode_link(match_title)
             if scraper_utils.meta_release_check(video_type, search_meta, item_meta):
                 result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                 results.append(result)
     
     if not results:
         results = self.__site_search(video_type, title, year)
         
     return results
开发者ID:CYBERxNUKE,项目名称:xbmc-addon,代码行数:30,代码来源:scenedown_scraper.py


示例19: search

 def search(self, video_type, title, year, season=''):
     search_url = urlparse.urljoin(self.base_url, '/search/%s.html')
     search_url = search_url % (urllib.quote_plus(title))
     html = self._http_get(search_url, cache_limit=1)
     results = []
     fragment = dom_parser.parse_dom(html, 'div', {'class': 'movie'})
     if fragment:
         for item in dom_parser.parse_dom(fragment[0], 'li'):
             match_url = dom_parser.parse_dom(item, 'a', ret='href')
             match_title = dom_parser.parse_dom(item, 'span', {'class': 'text'})
             match_year = dom_parser.parse_dom(item, 'span', {'class': 'year'})
             if match_url and match_title:
                 match_url = match_url[0]
                 match_title = re.sub('</?strong>', '', match_title[0])
                 is_season = re.search('Season\s+(\d+)$', match_title, re.I)
                 if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                     if video_type == VIDEO_TYPES.MOVIE:
                         if match_year:
                             match_year = match_year[0]
                         else:
                             match_year = ''
                     else:
                         if season and int(is_season.group(1)) != int(season):
                             continue
                         match_year = ''
                 
                     result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                     results.append(result)
     return results
开发者ID:freeworldxbmc,项目名称:KAOSbox-Repo,代码行数:29,代码来源:vivoto_scraper.py


示例20: __movie_search

    def __movie_search(self, title, year):
        results = []
        search_url = urlparse.urljoin(self.base_url, '/search?q=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        norm_title = scraper_utils.normalize_title(title)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title = dom_parser.parse_dom(item, 'img', ret='alt')
            match_year = ''
            if match_url and match_title:
                match_url = match_url[0]
                match_title = match_title[0]
                
                if match_year:
                    match_year = match_year[0]
                else:
                    match_year = ''
        
                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)

        return results
        
开发者ID:freeworldxbmc,项目名称:KAOSbox-Repo,代码行数:24,代码来源:moviewatcher_scraper.py



注:本文中的salts_lib.scraper_utils.cleanse_title函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python scraper_utils.disable_sub_check函数代码示例发布时间:2022-05-27
下一篇:
Python scraper_utils.blog_get_quality函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap