Python utils.download Function Code Examples


This article collects typical usage examples of the utils.utils.download function in Python. If you have been wondering what download does, how to call it, or what real-world usage looks like, the curated code examples below should help.



Twenty code examples of the download function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
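Most of the examples share one pattern: utils.download(url) fetches a page and returns its body, which is then handed to BeautifulSoup for parsing; a two-argument form, utils.download(url, file_path), saves the response to disk instead (see Example 15). For orientation only, here is a minimal sketch of what such a helper might look like; this is an assumption for illustration, not the actual implementation from the inspectors-general or domain-scan projects cited below.

import requests

def download(url, destination=None):
    # Hypothetical stand-in for utils.download; the real projects'
    # helpers differ (caching, retries, scraper sessions, etc.).
    response = requests.get(url)
    response.raise_for_status()
    if destination is None:
        # One-argument form: return the body, typically passed
        # straight to BeautifulSoup, as in most examples below.
        return response.text
    # Two-argument form: write the response to a local file path,
    # as in Example 15's utils.download(resource, str(analytics_path)).
    with open(destination, "wb") as f:
        f.write(response.content)
    return destination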

Example 1: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the RSS feed
  doc = BeautifulSoup(utils.download(RSS_URL))
  results = doc.select("item")
  for result in results:
    report = rss_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the recent audit reports
  doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive audit reports
  doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
  results = doc.select("div.block a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORTS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 34 | Source: smithsonian.py


Example 2: run

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2002:  # The oldest page for audit reports
            continue
        doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
        results = doc.select("div.content table tr")
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, report_type="audit", year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the FOIA reports
    doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, report_type="other", year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div.content a")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: ncua.py


Example 3: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # This is the earliest audits go back
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 26 | Source: tva.py


Example 4: run

def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: BunsenMcDubbs | Project: inspectors-general | Lines: 27 | Source: tigta.py


Example 5: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 28 | Source: fca.py


Example 6: urls_for_topics

  def urls_for_topics(self, topics):
    for topic in topics:
      # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
      # line).
      self.report_type = None
      if isinstance(topic, tuple):
        topic, report_type = topic
        self.report_type = report_type

      last_page = False

      url = TOPIC_TO_URL[topic]
      page = BeautifulSoup(utils.download(url))
      page_started = self.is_first_page(page)
      if page_started:
        yield url

      for link in page.select('li.pager-item a'):
        next_url = urljoin(url, link['href'])
        next_page = BeautifulSoup(utils.download(next_url))
        if not page_started:
          page_started = self.is_first_page(next_page)
        if page_started:
          yield next_url
        last_page = self.is_last_page(next_page)
        if last_page:
          break
      if last_page:
        continue
    self.report_type = None  # Clear this out afterwards
Developer: slobdell | Project: inspectors-general | Lines: 30 | Source: energy.py


Example 7: urls_for

  def urls_for(self):
    only = self.options.get('topics')
    if only: # if only...
      only = set(only.split(','))
      only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
              for o in only]
      yield from self.urls_for_topics(only)
      # If there are topics selected, ONLY yield URLs for those.
      return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
      md = RE_CALENDAR_YEAR.search(li.text)
      if md:
        cur_year = int(md.group(1))
        if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
          href = li.select('a')[0]['href']
          next_url = urljoin(BASE_URL, href)
          # The first page of reports is yielded.
          yield next_url

          # Next, read all the pagination links for the page and yield those. So
          # far, I haven't seen a page that doesn't have all of the following
          # pages enumerated.
          next_page = BeautifulSoup(utils.download(next_url))
          for link in next_page.select('li.pager-item a'):
            yield urljoin(BASE_URL, link['href'])
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: energy.py


Example 8: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 27 | Source: nasa.py


Example 9: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 29 | Source: usaid.py


Example 10: run

def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the semiannual reports
  semiannual_results = doc.select("#AnnualManagementReports select")[0]
  for result in semiannual_results.select("option"):
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the special reports
  special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
  for index, result in enumerate(special_report_table.select("tr")):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the audit reports
  for year in year_range:
    if year < 2001:  # The oldest fiscal year page available
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(year_url))
    for index, result in enumerate(doc.select("#main table tr")):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: rrb.py


Example 11: run

def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break

    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 27 | Source: fhfa.py


Example 12: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # The oldest year for audit reports
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content li")
    for result in results:
      report = audit_report_from(result, url, year, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#content li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the Peer Review
  doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
  result = doc.find("div", id='content').find("a", text=True)
  report = peer_review_from(result, year_range)
  inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 28 | Source: archives.py


Example 13: extract_reports_for_oei

def extract_reports_for_oei(year_range):
  topic_name = TOPIC_NAMES["OE"]
  topic_url = TOPIC_TO_URL["OE"]
  root_body = utils.download(topic_url)
  root_doc = BeautifulSoup(root_body)

  letter_urls = set()
  for link in root_doc.select("#leftContentInterior li a"):
    absolute_url = urljoin(topic_url, link['href'])
    absolute_url = strip_url_fragment(absolute_url)
    letter_urls.add(absolute_url)

  if not letter_urls:
    raise inspector.NoReportsFoundError("HHS (OEI first pass)")

  all_results_links = {}
  all_results_unreleased = []
  for letter_url in letter_urls:
    letter_body = utils.download(letter_url)
    letter_doc = BeautifulSoup(letter_body)

    results = letter_doc.select("#leftContentInterior ul li")
    if not results:
      raise inspector.NoReportsFoundError("HHS (OEI %s)" % letter_url)
    for result in results:
      if 'crossref' in result.parent.parent.attrs.get('class', []):
        continue
      if result.parent.parent.attrs.get('id') == 'related':
        continue

      node = result
      while node and node.name != "h2":
        node = node.previous
      if node and node.name == "h2":
        subtopic_name = str(node.text)
      else:
        subtopic_name = "(unknown)"

      links = result.findAll("a")
      if len(links) == 0:
        result.extract()
        all_results_unreleased.append([result, subtopic_name])
      else:
        url = links[0].get("href")
        if url not in all_results_links:
          result.extract()
          all_results_links[url] = [result, subtopic_name]
        else:
          existing_result = all_results_links[url][0]
          for temp in result.contents:
            temp.extract()
            existing_result.append(temp)
          all_results_links[url][1] = "%s, %s" % (all_results_links[url][1], subtopic_name)

  subtopic_url = TOPIC_TO_URL["OE"]
  for result, subtopic_name in itertools.chain(all_results_links.values(), all_results_unreleased):
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 59 | Source: hhs.py


Example 14: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports")
  for result in results:
    # ignore divider lines
    if result.select("img"): continue

    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
Developer: harrisj | Project: inspectors-general | Lines: 55 | Source: nsf.py


Example 15: handle_scanner_args

def handle_scanner_args(args, opts) -> Tuple[dict, list]:
    """
    --analytics: file path or URL to a CSV of participating domains.

    This function also handles checking for the existence of the file,
    downloading it successfully, and reading the file in order to populate the
    list of analytics domains.
    """
    parser = scan_utils.ArgumentParser(prefix_chars="--")
    parser.add_argument("--analytics", nargs=1, required=True)
    parsed, unknown = parser.parse_known_args(args)
    dicted = vars(parsed)
    should_be_single = ["analytics"]
    dicted = scan_utils.make_values_single(dicted, should_be_single)
    resource = dicted.get("analytics")
    if not resource.endswith(".csv"):
        no_csv = "".join([
            "--analytics should be the file path or URL to a CSV of participating",
            " domains and end with .csv, which '%s' does not" % resource
        ])
        logging.error(no_csv)
        raise argparse.ArgumentTypeError(no_csv)
    try:
        parsed_url = urlparse(resource)
    except:
        raise
    if parsed_url.scheme and parsed_url.scheme in ("http", "https"):
        analytics_path = Path(opts["_"]["cache_dir"], "analytics.csv").resolve()
        try:
            utils.download(resource, str(analytics_path))
        except:
            logging.error(utils.format_last_exception())
            no_csv = "--analytics URL %s not downloaded successfully." % resource
            logging.error(no_csv)
            raise argparse.ArgumentTypeError(no_csv)
    else:
        if not os.path.exists(resource):
            no_csv = "--analytics file %s not found." % resource
            logging.error(no_csv)
            raise FileNotFoundError(no_csv)
        else:
            analytics_path = resource

    analytics_domains = utils.load_domains(analytics_path)
    dicted["analytics_domains"] = analytics_domains
    del dicted["analytics"]

    return (dicted, unknown)
Developer: 18F | Project: domain-scan | Lines: 48 | Source: analytics.py


Example 16: extract_reports_for_subtopic

def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
  if subtopic_url.startswith("http://httphttp://"):
    # See notes to IG's web team
    subtopic_url = subtopic_url.replace("http://http", "")

  body = utils.download(subtopic_url)
  doc = BeautifulSoup(body)
  results = doc.select("#body-row02-col02andcol03 a")

  if not results:
    results = doc.select("#body-row02-col01andcol02andcol03 a")
  if not results and "There are currently no reports in this category" not in doc.text:
    raise AssertionError("No report links found for %s" % subtopic_url)

  topic_name = TOPIC_NAMES[topic]
  # Broadcasting Board of Governors is a fully independent agency
  if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
    agency = 'bbg'
  else:
    agency = 'state'

  for result in results:
    report = report_from(result, year_range, agency, topic_name, subtopic)
    if report:
      inspector.save_report(report)
Developer: BunsenMcDubbs | Project: inspectors-general | Lines: 25 | Source: state.py


Example 17: semiannual_report_from

def semiannual_report_from(result, year_range):
  link = result.find("a")

  title = link.text

  # Parse the report title. Ex:
  # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)'
  published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip()
  published_on_text = published_on_text.replace("September 31", "September 30")  # See note to IG Web team
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href'))
  landing_page = BeautifulSoup(utils.download(landing_url))

  report_url = landing_page.select("div.filefield-file a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Developer: slobdell | Project: inspectors-general | Lines: 34 | Source: usaid.py


Example 18: run

def run(options):
  year_range = inspector.year_range(options)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the audit reports
  audit_header = doc.find("a", attrs={"name": 'Audit Reports'})
  audit_list1 = audit_header.find_next("ul").select("li")
  # They have two separate uls for these reports. See note to the IG web team.
  audit_list2 = audit_header.find_next("ul").find_next("ul").select("li")
  results = audit_list1 + audit_list2

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the inspection reports
  inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'})
  results = inspections_header.find_next("ul").select("li")

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'})
  results = semiannual_header.find_next("ul").select("li")

  for result in results:
    report = report_from(result, year_range, title_prefix="Semiannual Report - ")
    if report:
      inspector.save_report(report)
Developer: JaimeLynSchatz | Project: inspectors-general | Lines: 34 | Source: fec.py


Example 19: run

def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url is None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue

        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done: break
Developer: spulec | Project: inspectors-general | Lines: 34 | Source: exim.py


Example 20: run

def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      body = utils.download(year_url)

      doc = BeautifulSoup(body)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 35 | Source: dot.py



Note: The utils.utils.download examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and reuse or redistribution should follow each project's license. Please do not repost without permission.

