
Python soupselect.select Function Code Examples


This article collects typical usage examples of the soupselect.select function in Python. If you are wondering what select does, how to call it, or what real-world usage looks like, the curated code examples below should help.



The sections below present 20 code examples of the select function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
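Before the examples, here is a minimal usage sketch of the pattern they all share: parse the markup with BeautifulSoup, then pass the resulting soup object and a CSS selector string to soupselect.select, which returns a list of matching tags. The HTML snippet and variable names are illustrative only, and the sketch assumes the classic BeautifulSoup 3 + soupselect combination that most of the examples below use; with BeautifulSoup 4 the same functionality is built in as soup.select().

from BeautifulSoup import BeautifulSoup
from soupselect import select

# Illustrative markup only; in practice the HTML usually comes from an HTTP request.
html = '''
<div id="courses">
  <p class="course">Algorithms</p>
  <p class="course">Databases</p>
</div>
'''

soup = BeautifulSoup(html)
# select(soup, css_selector) returns a list of matching Tag objects.
for p in select(soup, '#courses p.course'):
    print(p.string)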

Example 1: extractPage

def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()] 
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"

        obj = {
                'title': courseTitle,
                'number': courseNumber,
                'time': courseTime
                }
        result.append(obj)

    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
    if subresults:
        result.extend(subresults) 
    return result
Author: dahabit, Project: scrap, Lines of code: 32, Source: app.py


Example 2: parse_obituary

def parse_obituary(url,category):
	"""
	Extracts the necessary information from a single obituary page
	"""
	page = requests.get(url)
	soup = Soup(page.text)
	try:
		date = select(soup, 'p strong')[0].contents[0]
		date = date[date.rfind('died ')+5:].strip()
		cal = pdt.Calendar()
		print >> sys.stderr, 'parsing',date
		date = cal.parseDateText(date)
	except:
		print >> sys.stderr, 'failed to parse'
		return
	date = str('%s/%s/%s' % (date[2],date[1],date[0]))
	publisher = 'Telegraph'
	type = 'obituaries'
	name = select(soup, '.storyHead h1')[0].contents[0]
	content = ''
	for para in select(soup, '#mainBodyArea p'):
		if len(para.contents) > 0:
			content = content + para.contents[0]

	content = content.strip().replace('"','\'')		
	content = content.strip().replace('\n','')
	
	print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (date.encode("UTF-8"),
													publisher.encode("UTF-8"),
													type.encode("UTF-8"),
													name.encode("UTF-8"),
													content.encode("UTF-8"),
													category.encode("UTF-8"))
Author: MRdNk, Project: swbd, Lines of code: 33, Source: scrape_obituaries.py


Example 3: get_raw_boxscore_data

  def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
      cells = select(player_data, 'td')
      if len(cells) == 13:
        # This order should match the boxscore table on espn
        (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
            to, pf, pts) = [
          cell.text for cell in cells
        ]

        if not player_name:
          continue

        fgm, fga = fgma.split('-')
        tpm, tpa = tpma.split('-')
        ftm, fta = ftma.split('-')

        (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
            pf, pts) = map(int, [
          minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
              pf, pts
        ])

        boxscore_data.append({
          'name': player_name, 'minutes': minutes, 'fgm': fgm, 'fga': fga,
          'tpm': tpm, 'tpa': tpa, 'ftm': ftm, 'fta': fta,
          'oreb': oreb, 'reb': reb,
          'ast': ast, 'stl': stl, 'blk': blk, 'to': to, 'pf': pf, 'pts': pts,
        })

    return boxscore_data
Author: gunsch, Project: ncaa-stats, Lines of code: 35, Source: scraper.py


Example 4: _extract_predictions

	def _extract_predictions(self, html):
		if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
			return None
		else:
			predictions = []
			soup = BeautifulSoup(html)	

			# get the primary/imminent prediction		
			try:
				minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
			except:
				return None
			if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
				predictions.append(0)
			else:
				predictions.append(int(minutes))

			# get the other predictions
			for m in select(soup, '.predictionNumberForOtherPreds'):
				m = self._clean_prediction_html(m)
				try:
					predictions.append(int(m))
				except:
					pass

			return predictions
Author: sbma44, Project: markmograph, Lines of code: 26, Source: nextbus.py


Example 5: expandDocument

	def expandDocument(self,header,content,config=None):
		raise "obsolete"
		part = self.partDocument(header["document"],config)
		soup = part.expandSoup(content)
		header = part.get_collapsed_header(header=header)
		stateful_doc = "stateful" in header and header["stateful"] is True

		if stateful_doc:
			script = part.statefulConfigScript()
			if script:
				script_tag = soup.new_tag("script")
				script_tag["type"] = "application/config"
				script_tag.string = script
				soup.body.append(script_tag)

		# fill in meta tags
		self._applyMetaAndTitle(soup,header,config)

		if config["appcache"] == False:
			for h in select(soup,"html"):
				del h["manifest"]
		elif "manifest" in header:
			for h in select(soup,"html"):
				h["manifest"] = header["manifest"]

		if "Content-Language" in header:
			for h in select(soup,"html"):
				h["lang"] = header["Content-Language"]

		# offline markers
		lists = {
			"offline": self._getOfflineList(soup,header),
		}

		return soup.prettify(), lists
Author: thepian, Project: thepian-pages, Lines of code: 35, Source: browsers.py


Example 6: fetch_review_counts

def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass

    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, { 'appid': appid, 'status': request.code})

    soup = BeautifulSoup(request)

    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])

    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url

    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])

    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url

    return positive_count, negative_count
Author: jorgenpt, Project: steam-tools, Lines of code: 28, Source: steam_fetch_review_counts.py


Example 7: scrapeBlog

def scrapeBlog(blog):
	global completed
	blogurl = blog['postUrl']
	blogData = {}
	try:
		soup = Soup(urllib2.urlopen(blogurl))
		post = select(soup, 'div.post-body')

		title = select(soup, 'h1.title')
		titleNoTags = Soup(str(title))
		rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
		#print rawTitle

		noScript = Soup(str(post))
		rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
		#print raw_text

		blogData['source'] = str(rawTitle)
		blogData['title'] = blog['titleNoFormatting']
		blogData['content'] = str(rawText)
		blogData['date'] = blog['publishedDate']
		blogData['url'] = str(blogurl)

	except Exception:
		pass
	with dataLock:
		data.append(blogData)
		completed += 1
Author: Jbalkind, Project: Amazon-Hackathon, Lines of code: 28, Source: blogger.py


Example 8: fetch_data

def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Author: orithena, Project: sportswarnbot, Lines of code: 34, Source: bvb.py


Example 9: sees_an_element

    def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
        """ Tests for the presence of a specified element on the current page...

        self.alice.sees_an_element(doc, id="element_id")
        self.alice.sees_an_element(doc, "element")
        self.alice.sees_an_element(doc, "div", "element_css_class")
        self.alice.sees_an_element(doc, selector="#myid element.bar")
        """
        selector = "any"
        if id:
            displayed_element = doc.find(id=id)
            selector = id
        elif css_selector:
            displayed_elements = select(doc, css_selector)
            displayed_element = displayed_elements[0] if displayed_elements else None
            selector = css_selector
        else:
            if css_class:
                selector = "%s.%s" % (element, css_class)
                displayed_element = select(doc, selector)
            else:
                displayed_element = doc.find(element)
                selector = element
        self.failUnless(displayed_element, "Could not find %s" % (selector))
        return displayed_element
Author: emlprime, Project: wizards_duel, Lines of code: 25, Source: tests.py


Example 10: Loop_Through_Messages

def Loop_Through_Messages(i): #i = start ID - 1
    
    while i < MaxMSG:
        i += 1
        
        Humanize(2) #Humanize the program by sleeping 0-2 seconds
        
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))

            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')

            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ',map(unicode,msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            
            if MSG_Title == '': MSG_Title = '(none)'
            if MSG_Body == '': MSG_Body = '(none)'
            
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
Author: matthew-reilly, Project: freecycle, Lines of code: 32, Source: Freecycle_ETL.py


Example 11: expand

	def expand(self,header,content,markup=None,config=None):
		"""
		General header/content expansion replacing expandDocument and expandScss
		"""
		lists = {
			"offline": [],
		}

		if "charset" not in header and markup is not None:
			header["charset"] = config["charset"]
		parent_doc = None
		if "document" in header:
			parent_doc = self.partDocument(header["document"],config)
			header = parent_doc.get_collapsed_header(header=header)

		if markup == "scss":
			content = self.expandScss(header,content,config=config)
		elif markup in ("text","xml"):
			pass #TODO consider what to do
		elif markup == "html":
			soup = None
			if parent_doc:
				soup = parent_doc.expandSoup(content)
			else:
				soup = BeautifulSoup(content,"html5lib")

			if "lang" in header:
				pass #TODO mark html element

			# print soup.head
			stateful_doc = "stateful" in header and header["stateful"] is True

			if stateful_doc:
				script = parent_doc.statefulConfigScript()
				if script:
					script_tag = soup.new_tag("script")
					script_tag["type"] = "application/config"
					script_tag.string = script
					soup.body.append(script_tag)

			# fill in meta tags
			self._applyMetaAndTitle(soup,header,config)

			if config["appcache"] == False:
				for h in select(soup,"html"):
					del h["manifest"]
			elif "manifest" in header:
				for h in select(soup,"html"):
					h["manifest"] = header["manifest"]

			if "Content-Language" in header:
				for h in select(soup,"html"):
					h["lang"] = header["Content-Language"]

			# offline markers
			lists["offline"] = self._getOfflineList(soup,header)
			content = soup.encode()

		return header, content, lists
Author: thepian, Project: thepian-pages, Lines of code: 59, Source: browsers.py


Example 12: get_games

def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None

    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)

    result = []

    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()

            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("&#36;"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                price = None
                logging.error("Price parse error for %s: '%s'" % (name, price))
        else:
            price = None

        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None

        result.append(Game(id=id, name=name, price=price, metascore=metascore))

    return result
Author: porkbuns, Project: steam-price-graph, Lines of code: 57, Source: SteamApi.py


Example 13: raw_events

def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
Author: mneedham, Project: neo4j-bbc, Lines of code: 9, Source: extractor.py


Example 14: get_resources

 def get_resources(self, doc):
     resources = []
     for a in select(doc, 'a'):
         url = a.get('href')
         img = select(a, 'img[src]')[0]
         src = img.get('src')
         f_type = REG_URL_FILE.search(src).group(1).lower()
         resources.append((url, f_type))
     return resources
Author: dedsm, Project: coursera, Lines of code: 9, Source: coursera.py


Example 15: find_footnotes_and_anchors

def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])

    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i+1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])

    return footnote_anchors, footnotes
Author: eaudeweb, Project: naaya.content.talkback, Lines of code: 12, Source: Convertor2.py


Example 16: parse

 def parse(self):
     if not self.soup:
         return
     out = []
     for tr in select(self.soup, '#content table tr'):
         td = select(tr, 'td')
         if len(td) != 3:
             continue
         name = select(td[1], 'strong')[0].string
         msg = urlizetrunc(striptags(select(td[2], 'div')[0].renderContents()), 30)
         out.append((name, msg))
     self.data = out[:]
Author: msgre, Project: djangoproject.cz, Lines of code: 12, Source: fetcher.py


Example 17: getLinks

def getLinks(cat, sponsor=True):
    _links = []
    r = s.get(cat)
    soup = soupify(r)
    table = select(soup, 'table.categories')[0] if page != 1 or sponsor==False else select(soup, 'table.categories')[1]

    tr = select(table, 'tr')
    for t in tr:
        link = select(t, 'h3 a')
        if link:
            _links.append(str(dict(link[0].attrs)['href']))

    return _links
Author: mintyPT, Project: scrapers, Lines of code: 13, Source: app.py


Example 18: process

def process(d, i=None):
    ''' function to process one entry of the table '''
    # to keep a small idea if this is still working (output)
    if i:
        print '%s' % i
    else:
        print '.'

    # extraction of the link of interest
    link = d['penalty_notice_link']

    # if we havn't downloaded the link yet, we do it and keep in into a html file into the temp folder
    if not os.path.exists('./temp/%s.html' % hash(link)):
        r = requests.get(link)
        with open('./temp/%s.html' % hash(link), 'w') as h:
            h.write(r.text.encode('utf-8'))

    # load the hmtl markup
    with open('./temp/%s.html' % hash(link), 'r') as h:
        source = h.read()

    # if we havnt previously extracted the info, we do it now
    if not os.path.exists('./temp/%s.pickle' % hash(link)):

        # to extract info it's usually the same way:
        #   - use BeautifulSoup to create the soup of the source
        #   - use select and some css classes/ids to extract info
        # => it's exaclty what is down below

        soup = BeautifulSoup(source)
        div = select(soup, 'div.cim_content')[0]
        table = select(div, 'table')[0]
        rows = select(table, 'tr')

        address = str(select(rows[2], 'td')[-1].contents[0])
        offence_code = str(select(rows[5], 'td')[-1].contents[0])
        nature = str(select(rows[6], 'td')[-1].contents[0])
        amount = str(select(rows[7], 'td')[-1].contents[0])
        data_penalty = str(select(rows[9], 'td')[-1].contents[0])
        issued_by = str(select(rows[10], 'td')[-1].contents[0])

        d['address'] = address
        d['offence_code'] = offence_code
        d['nature'] = nature
        d['amount'] = amount
        d['data_penalty'] = data_penalty
        d['issued_by'] = issued_by

        with open('./temp/%s.pickle' % hash(link), 'w') as h:
            pickle.dump(d, h)
    else:
        # we have previously extracted the info, we simply load it avoiding extra work
        with open('./temp/%s.pickle' % hash(link), 'r') as h:
            d = pickle.load(h)

    return d
Author: mintyPT, Project: scrapers, Lines of code: 56, Source: app2.py


Example 19: html_cleanup

def html_cleanup(html, remove_list = (), encoding=None, log=False):
	"""
	Returns (str cleaned_html, bool changes)
	``remove_list``: is list of selectors, currently supported only attribute and class selectors,
	e.g. ['p.[lang]', u'p.список-western', '[orphaned-attribute]', '.orphaned-class-name']
	``encoding`` is html encoding, autodetected if not passed
	"""

	soup = BeautifulSoup(html, fromEncoding=encoding)

	changes = False

	for selector in remove_list:
		m = REC_ATTR.match(selector)
		if m:
			attr, = m.groups()
			for element in select(soup, selector):
				if log:
					print "removing %s[%s]" % (element.name, attr)
				element.attrs = [item for item in element.attrs if item[0] != attr]
				changes = True

		else:
			m = REC_CLASS.match(selector)
			if m:
				tag, cls = m.groups()
				selector = (tag or '') + u'[class]'

				for element in select(soup, selector):

					for i, (attr, value) in enumerate(element.attrs):
						if attr == u'class':
							class_index = i

					classes = filter(None, element.attrs[class_index][1].split(' '))
					try:
						classes.remove(cls)
					except ValueError:	# not in list
						pass
					else:
						if log:
							print "removing %s.%s" % (element.name, cls)
						element.attrs[class_index] = (u'class', ' '.join(classes))
						changes = True

	if changes:
		return soup.prettify(encoding=soup.fromEncoding or soup.originalEncoding), changes
	else:
		return html, changes
Author: HarmonyEnterpriseSolutions, Project: toolib, Lines of code: 49, Source: html_cleanup.py


Example 20: parseStance

def parseStance(stance):
	issue = select(stance, "div.issue div.issuetext")[0].text
	e = select(stance, "div.quotelink")[0]
	if e.text:
		attrs = map(attrSplit, e.text.split("\" quote"))
		attrMap = {}
		for attr in attrs:
			if len(attr) == 2: attrMap[attr[0]] = attr[1]
		value = attrMap["stand"]
		source = attrMap["src"]
	else:
		value = e["quotestand"]
		source = e["quotesrc"]
	value = value == "colgreencheckmark"
	return [issue, value, source]
Author: rajeem, Project: politiko, Lines of code: 15, Source: politiko.py



Note: the soupselect.select examples in this article were collected from source-code and documentation platforms such as GitHub and MSDocs, and the snippets were selected from open-source projects contributed by their respective developers. Copyright remains with the original authors; for redistribution and use, refer to each project's license. Do not repost without permission.


