This article collects typical usage examples of the soupselect.select function in Python. If you have been wondering exactly how to call select, how it is used in practice, or want working examples to learn from, the hand-picked code samples below should help.
Twenty code examples of the select function are shown below, sorted by popularity by default.
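Before the examples, here is a minimal, self-contained sketch of how soupselect.select is typically combined with BeautifulSoup. The HTML snippet and the CSS selector are invented purely for illustration, and the sketch assumes a Python 2 environment with BeautifulSoup 3 and soupselect installed, matching the style of the examples that follow.

# Minimal sketch: the HTML and the selector below are made up for illustration only.
from BeautifulSoup import BeautifulSoup  # assumption: BeautifulSoup 3 is installed
from soupselect import select            # assumption: soupselect is installed

html = '''
<div class="courseInfo"><span class="courseTitle">Intro to Parsing</span></div>
<div class="courseInfo"><span class="courseTitle">Advanced Scraping</span></div>
'''

soup = BeautifulSoup(html)
# select() takes a parsed soup (or any tag) plus a CSS selector string
# and returns a list of matching elements.
for title in select(soup, 'div.courseInfo span.courseTitle'):
    print title.string

select returns a plain list, so ordinary indexing and iteration work; every example below builds on exactly this pattern.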
Example 1: extractPage
def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()]
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"
        obj = {
            'title': courseTitle,
            'number': courseNumber,
            'time': courseTime
        }
        result.append(obj)
    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
    if subresults:
        result.extend(subresults)
    return result
Developer: dahabit, Project: scrap, Lines: 32, Source: app.py
Example 2: parse_obituary
def parse_obituary(url,category):
    """
    Extracts the necessary information from a single obituary page
    """
    page = requests.get(url)
    soup = Soup(page.text)
    try:
        date = select(soup, 'p strong')[0].contents[0]
        date = date[date.rfind('died ')+5:].strip()
        cal = pdt.Calendar()
        print >> sys.stderr, 'parsing',date
        date = cal.parseDateText(date)
    except:
        print >> sys.stderr, 'failed to parse'
        return
    date = str('%s/%s/%s' % (date[2],date[1],date[0]))
    publisher = 'Telegraph'
    type = 'obituaries'
    name = select(soup, '.storyHead h1')[0].contents[0]
    content = ''
    for para in select(soup, '#mainBodyArea p'):
        if len(para.contents) > 0:
            content = content + para.contents[0]
    content = content.strip().replace('"','\'')
    content = content.strip().replace('\n','')
    print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (date.encode("UTF-8"),
                                                    publisher.encode("UTF-8"),
                                                    type.encode("UTF-8"),
                                                    name.encode("UTF-8"),
                                                    content.encode("UTF-8"),
                                                    category.encode("UTF-8"))
Developer: MRdNk, Project: swbd, Lines: 33, Source: scrape_obituaries.py
Example 3: get_raw_boxscore_data
def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
        cells = select(player_data, 'td')
        if len(cells) == 13:
            # This order should match the boxscore table on espn
            (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
             to, pf, pts) = [
                cell.text for cell in cells
            ]
            if not player_name:
                continue
            fgm, fga = fgma.split('-')
            tpm, tpa = tpma.split('-')
            ftm, fta = ftma.split('-')
            (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
             pf, pts) = map(int, [
                minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
                pf, pts
            ])
            boxscore_data.append({
                'name': player_name, 'minutes': minutes, 'fgm': fgm, 'fga': fga,
                'tpm': tpm, 'tpa': tpa, 'ftm': ftm, 'fta': fta,
                'oreb': oreb, 'reb': reb,
                'ast': ast, 'stl': stl, 'blk': blk, 'to': to, 'pf': pf, 'pts': pts,
            })
    return boxscore_data
Developer: gunsch, Project: ncaa-stats, Lines: 35, Source: scraper.py
Example 4: _extract_predictions
def _extract_predictions(self, html):
    if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
        return None
    else:
        predictions = []
        soup = BeautifulSoup(html)
        # get the primary/imminent prediction
        try:
            minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
        except:
            return None
        if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
            predictions.append(0)
        else:
            predictions.append(int(minutes))
        # get the other predictions
        for m in select(soup, '.predictionNumberForOtherPreds'):
            m = self._clean_prediction_html(m)
            try:
                predictions.append(int(m))
            except:
                pass
        return predictions
Developer: sbma44, Project: markmograph, Lines: 26, Source: nextbus.py
Example 5: expandDocument
def expandDocument(self,header,content,config=None):
    raise Exception("obsolete")  # string exceptions are invalid; the rest of this method is dead code
    part = self.partDocument(header["document"],config)
    soup = part.expandSoup(content)
    header = part.get_collapsed_header(header=header)
    stateful_doc = "stateful" in header and header["stateful"] is True
    if stateful_doc:
        script = part.statefulConfigScript()
        if script:
            script_tag = soup.new_tag("script")
            script_tag["type"] = "application/config"
            script_tag.string = script
            soup.body.append(script_tag)
    # fill in meta tags
    self._applyMetaAndTitle(soup,header,config)
    if config["appcache"] == False:
        for h in select(soup,"html"):
            del h["manifest"]
    elif "manifest" in header:
        for h in select(soup,"html"):
            h["manifest"] = header["manifest"]
    if "Content-Language" in header:
        for h in select(soup,"html"):
            h["lang"] = header["Content-Language"]
    # offline markers
    lists = {
        "offline": self._getOfflineList(soup,header),
    }
    return soup.prettify(), lists
Developer: thepian, Project: thepian-pages, Lines: 35, Source: browsers.py
Example 6: fetch_review_counts
def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass
    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, { 'appid': appid, 'status': request.code})
    soup = BeautifulSoup(request)
    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])
    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url
    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])
    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url
    return positive_count, negative_count
Developer: jorgenpt, Project: steam-tools, Lines: 28, Source: steam_fetch_review_counts.py
Example 7: scrapeBlog
def scrapeBlog(blog):
    global completed
    blogurl = blog['postUrl']
    blogData = {}
    try:
        soup = Soup(urllib2.urlopen(blogurl))
        post = select(soup, 'div.post-body')
        title = select(soup, 'h1.title')
        titleNoTags = Soup(str(title))
        rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
        #print rawTitle
        noScript = Soup(str(post))
        rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
        #print raw_text
        blogData['source'] = str(rawTitle)
        blogData['title'] = blog['titleNoFormatting']
        blogData['content'] = str(rawText)
        blogData['date'] = blog['publishedDate']
        blogData['url'] = str(blogurl)
    except Exception:  # original read "except e:", which fails with a NameError
        pass
    with dataLock:
        data.append(blogData)
        completed += 1
Developer: Jbalkind, Project: Amazon-Hackathon, Lines: 28, Source: blogger.py
Example 8: fetch_data
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s
    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)
    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Developer: orithena, Project: sportswarnbot, Lines: 34, Source: bvb.py
Example 9: sees_an_element
def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
    """ Tests for the presence of a specified element on the current page...
    self.alice.sees_an_element(doc, id="element_id")
    self.alice.sees_an_element(doc, "element")
    self.alice.sees_an_element(doc, "div", "element_css_class")
    self.alice.sees_an_element(doc, selector="#myid element.bar")
    """
    selector = "any"
    if id:
        displayed_element = doc.find(id=id)
        selector = id
    elif css_selector:
        displayed_elements = select(doc, css_selector)
        displayed_element = displayed_elements[0] if displayed_elements else None
        selector = css_selector
    else:
        if css_class:
            selector = "%s.%s" % (element, css_class)
            displayed_element = select(doc, selector)
        else:
            displayed_element = doc.find(element)
            selector = element
    self.failUnless(displayed_element, "Could not find %s" % (selector))
    return displayed_element
Developer: emlprime, Project: wizards_duel, Lines: 25, Source: tests.py
Example 10: Loop_Through_Messages
def Loop_Through_Messages(i): #i = start ID - 1
    while i < MaxMSG:
        i += 1
        Humanize(2) #Humanize the program by sleeping 0-2 seconds
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))
            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')
            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ',map(unicode,msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            if MSG_Title == '': MSG_Title = '(none)'
            if MSG_Body == '': MSG_Body = '(none)'
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
Developer: matthew-reilly, Project: freecycle, Lines: 32, Source: Freecycle_ETL.py
Example 11: expand
def expand(self,header,content,markup=None,config=None):
    """
    General header/content expansion replacing expandDocument and expandScss
    """
    lists = {
        "offline": [],
    }
    if "charset" not in header and markup is not None:
        header["charset"] = config["charset"]
    parent_doc = None
    if "document" in header:
        parent_doc = self.partDocument(header["document"],config)
        header = parent_doc.get_collapsed_header(header=header)
    if markup == "scss":
        content = self.expandScss(header,content,config=config)
    elif markup in ("text","xml"):
        pass #TODO consider what to do
    elif markup == "html":
        soup = None
        if parent_doc:
            soup = parent_doc.expandSoup(content)
        else:
            soup = BeautifulSoup(content,"html5lib")
        if "lang" in header:
            pass #TODO mark html element
            # print soup.head
        stateful_doc = "stateful" in header and header["stateful"] is True
        if stateful_doc:
            script = parent_doc.statefulConfigScript()
            if script:
                script_tag = soup.new_tag("script")
                script_tag["type"] = "application/config"
                script_tag.string = script
                soup.body.append(script_tag)
        # fill in meta tags
        self._applyMetaAndTitle(soup,header,config)
        if config["appcache"] == False:
            for h in select(soup,"html"):
                del h["manifest"]
        elif "manifest" in header:
            for h in select(soup,"html"):
                h["manifest"] = header["manifest"]
        if "Content-Language" in header:
            for h in select(soup,"html"):
                h["lang"] = header["Content-Language"]
        # offline markers
        lists["offline"] = self._getOfflineList(soup,header)
        content = soup.encode()
    return header, content, lists
Developer: thepian, Project: thepian-pages, Lines: 59, Source: browsers.py
Example 12: get_games
def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None
    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)
    result = []
    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()
            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("$"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                price = None
                logging.error("Price parse error for %s: '%s'" % (name, price))
        else:
            price = None
        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None
        result.append(Game(id=id, name=name, price=price, metascore=metascore))
    return result
Developer: porkbuns, Project: steam-price-graph, Lines: 57, Source: SteamApi.py
Example 13: raw_events
def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
Developer: mneedham, Project: neo4j-bbc, Lines: 9, Source: extractor.py
Example 14: get_resources
def get_resources(self, doc):
    resources = []
    for a in select(doc, 'a'):
        url = a.get('href')
        img = select(a, 'img[src]')[0]
        src = img.get('src')
        f_type = REG_URL_FILE.search(src).group(1).lower()
        resources.append((url, f_type))
    return resources
Developer: dedsm, Project: coursera, Lines: 9, Source: coursera.py
Example 15: find_footnotes_and_anchors
def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])
    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i+1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])
    return footnote_anchors, footnotes
Developer: eaudeweb, Project: naaya.content.talkback, Lines: 12, Source: Convertor2.py
Example 16: parse
def parse(self):
    if not self.soup:
        return
    out = []
    for tr in select(self.soup, '#content table tr'):
        td = select(tr, 'td')
        if len(td) != 3:
            continue
        name = select(td[1], 'strong')[0].string
        msg = urlizetrunc(striptags(select(td[2], 'div')[0].renderContents()), 30)
        out.append((name, msg))
    self.data = out[:]
Developer: msgre, Project: djangoproject.cz, Lines: 12, Source: fetcher.py
Example 17: getLinks
def getLinks(cat, sponsor=True):
    _links = []
    r = s.get(cat)
    soup = soupify(r)
    table = select(soup, 'table.categories')[0] if page != 1 or sponsor==False else select(soup, 'table.categories')[1]
    tr = select(table, 'tr')
    for t in tr:
        link = select(t, 'h3 a')
        if link:
            _links.append(str(dict(link[0].attrs)['href']))
    return _links
Developer: mintyPT, Project: scrapers, Lines: 13, Source: app.py
Example 18: process
def process(d, i=None):
    ''' function to process one entry of the table '''
    # to keep a small idea if this is still working (output)
    if i:
        print '%s' % i
    else:
        print '.'
    # extraction of the link of interest
    link = d['penalty_notice_link']
    # if we haven't downloaded the link yet, we do it and keep it in an html file in the temp folder
    if not os.path.exists('./temp/%s.html' % hash(link)):
        r = requests.get(link)
        with open('./temp/%s.html' % hash(link), 'w') as h:
            h.write(r.text.encode('utf-8'))
    # load the html markup
    with open('./temp/%s.html' % hash(link), 'r') as h:
        source = h.read()
    # if we haven't previously extracted the info, we do it now
    if not os.path.exists('./temp/%s.pickle' % hash(link)):
        # to extract info it's usually the same way:
        # - use BeautifulSoup to create the soup of the source
        # - use select and some css classes/ids to extract info
        # => it's exactly what is done below
        soup = BeautifulSoup(source)
        div = select(soup, 'div.cim_content')[0]
        table = select(div, 'table')[0]
        rows = select(table, 'tr')
        address = str(select(rows[2], 'td')[-1].contents[0])
        offence_code = str(select(rows[5], 'td')[-1].contents[0])
        nature = str(select(rows[6], 'td')[-1].contents[0])
        amount = str(select(rows[7], 'td')[-1].contents[0])
        data_penalty = str(select(rows[9], 'td')[-1].contents[0])
        issued_by = str(select(rows[10], 'td')[-1].contents[0])
        d['address'] = address
        d['offence_code'] = offence_code
        d['nature'] = nature
        d['amount'] = amount
        d['data_penalty'] = data_penalty
        d['issued_by'] = issued_by
        with open('./temp/%s.pickle' % hash(link), 'w') as h:
            pickle.dump(d, h)
    else:
        # we have previously extracted the info, we simply load it avoiding extra work
        with open('./temp/%s.pickle' % hash(link), 'r') as h:
            d = pickle.load(h)
    return d
Developer: mintyPT, Project: scrapers, Lines: 56, Source: app2.py
Example 19: html_cleanup
def html_cleanup(html, remove_list = (), encoding=None, log=False):
    """
    Returns (str cleaned_html, bool changes)
    ``remove_list``: is list of selectors, currently supported only attribute and class selectors,
    e.g. ['p.[lang]', u'p.список-western', '[orphaned-attribute]', '.orphaned-class-name']
    ``encoding`` is html encoding, autodetected if not passed
    """
    soup = BeautifulSoup(html, fromEncoding=encoding)
    changes = False
    for selector in remove_list:
        m = REC_ATTR.match(selector)
        if m:
            attr, = m.groups()
            for element in select(soup, selector):
                if log:
                    print "removing %s[%s]" % (element.name, attr)
                element.attrs = [item for item in element.attrs if item[0] != attr]
                changes = True
        else:
            m = REC_CLASS.match(selector)
            if m:
                tag, cls = m.groups()
                selector = (tag or '') + u'[class]'
                for element in select(soup, selector):
                    for i, (attr, value) in enumerate(element.attrs):
                        if attr == u'class':
                            class_index = i
                    classes = filter(None, element.attrs[class_index][1].split(' '))
                    try:
                        classes.remove(cls)
                    except ValueError: # not in list
                        pass
                    else:
                        if log:
                            print "removing %s.%s" % (element.name, cls)
                        element.attrs[class_index] = (u'class', ' '.join(classes))
                        changes = True
    if changes:
        return soup.prettify(encoding=soup.fromEncoding or soup.originalEncoding), changes
    else:
        return html, changes
Developer: HarmonyEnterpriseSolutions, Project: toolib, Lines: 49, Source: html_cleanup.py
Example 20: parseStance
def parseStance(stance):
    issue = select(stance, "div.issue div.issuetext")[0].text
    e = select(stance, "div.quotelink")[0]
    if e.text:
        attrs = map(attrSplit, e.text.split("\" quote"))
        attrMap = {}
        for attr in attrs:
            if len(attr) == 2: attrMap[attr[0]] = attr[1]
        value = attrMap["stand"]
        source = attrMap["src"]
    else:
        value = e["quotestand"]
        source = e["quotesrc"]
    value = value == "colgreencheckmark"
    return [issue, value, source]
Developer: rajeem, Project: politiko, Lines: 15, Source: politiko.py
Note: The soupselect.select examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms; the snippets are drawn from open-source projects contributed by their respective authors, who retain copyright of the source code. Please consult each project's License before redistributing or reusing the code. Do not reproduce this article without permission.