• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python tidylib.tidy_document函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中tidylib.tidy_document函数的典型用法代码示例。如果您正苦于以下问题:Python tidy_document函数的具体用法?Python tidy_document怎么用?Python tidy_document使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了tidy_document函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_doc_with_entity

 def test_doc_with_entity(self):
     """Tidy a document containing an accented character, with and
     without the numeric-entities option.

     NOTE(review): both expected values are the literal "é" in this
     transcription; the original almost certainly compared a named vs.
     a numeric entity — confirm against the upstream test file.
     """
     source = "é"

     # Default options.
     tidied, _ = tidy_document(source)
     self.assertEqual(tidied, DOC % "é")

     # With numeric entities requested.
     tidied, _ = tidy_document(source, {'numeric-entities': 1})
     self.assertEqual(tidied, DOC % "é")
开发者ID:18600597055,项目名称:hue,代码行数:9,代码来源:DocsTest.py


示例2: _massage_diff_content

def _massage_diff_content(content):
    """Run diff content through HTML Tidy.

    Returns the ``(document, errors)`` pair produced by ``tidy_document``.
    If pytidylib raises ``UnicodeDecodeError`` on the text, the input is
    re-fed as UTF-8 bytes and the tidied output decoded back to text so
    the caller always gets the same shape of result.
    """
    opts = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        return tidy_document(content, options=opts)
    except UnicodeDecodeError:
        # Retry with an explicit UTF-8 round-trip.
        tidied, errors = tidy_document(content.encode('utf-8'), options=opts)
        return tidied.decode('utf-8'), errors
开发者ID:VitorVRS,项目名称:kuma,代码行数:14,代码来源:helpers.py


示例3: marklogic_put_xml

   def marklogic_put_xml(self, item, spider_name):
       """PUT one scraped item into MarkLogic as an XML document.

       The item is serialized with dicttoxml, un-escaped so embedded
       markup survives, tidied into well-formed XML, and uploaded via
       the MarkLogic REST API with digest authentication. Per-item
       'ml_uri'/'ml_user'/'ml_pwd' keys override the spider defaults.
       """
       # Target URI and collection; add the REST transform only when configured.
       params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
       if self.ml_transform != '':
           params['transform'] = self.ml_transform

       # Serialize to XML, then undo the entity escaping dicttoxml applied.
       body = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
       for entity, char in (('&lt;', '<'), ('&gt;', '>'), ('&apos;', "'"), ('&quot;', '"')):
           body = body.replace(entity, char)

       # Tidy guarantees well-formed XML after the raw replacements above.
       body, errors = tidy_document(body, options={'input-xml': 1})

       headers = {'Content-Type': 'application/xml'}

       # Item-level endpoint/credentials win over the spider-wide settings.
       ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
       logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])

       ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
       ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
       response = requests.put(ml_uri,
           params = params,
           auth = HTTPDigestAuth(ml_user, ml_pwd),
           data = body,
           headers = headers)

       logging.info("PUT response: " + str(response.status_code) + ", " + response.text)
开发者ID:michelderu,项目名称:ml-scrapy-pipeline,代码行数:29,代码来源:pipelines.py


示例4: fetch_data

def fetch_data():
    """Scrape bvb.de for the next BVB match and build a German warning line.

    Returns:
        (out, matchtime): the warning string (empty if parsing partially
        failed before assignment) and the kick-off datetime.

    NOTE(review): Python 2 code (urllib2, u'' literals). Relies on
    module-level `tidyoptions`, `Soup` and the `select` CSS helper.
    Exits the process (sys.exit(1)) when no next match is listed.
    """
    def bvbreplace(s):
        # Normalize any team string containing "Dortmund" to the short "BVB".
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        # Tidy the fetched page first so the soup parser gets clean markup.
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    # Default kick-off: now + 25h — presumably a "more than a day away"
    # sentinel for callers; TODO confirm with the consuming code.
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            # Preferred: dedicated tournament span.
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            # Fallback layout: league name is the third plain span.
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        # Home games: avoid the stadium-area routes; away games: pubs with TV.
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
开发者ID:orithena,项目名称:sportswarnbot,代码行数:34,代码来源:bvb.py


示例5: call

def call():
    """Execute the pending request described by the shared `world` state
    and store the parsed result in world.results.

    The response is parsed according to its format: HTML is validated
    with tidy, XML is DOM-parsed, JSON/JSONP is decoded preserving key
    order. NOTE(review): Python 2 code (urllib/urllib2, lettuce-style
    `world` globals).
    """
    # Already executed for this scenario — nothing to do.
    if world.results:
        return

    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()

    # Default format: 'xml' for reverse queries, 'html' otherwise.
    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    # The service must always declare UTF-8.
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        # Validate the HTML with tidy; any reported error fails the step.
        document, errors = tidy_document(page, 
                             options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        # JSON or JSONP: strip the callback wrapper before decoding.
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        # OrderedDict keeps response key order for later assertions.
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)


示例6: test_xmlns_large_document_xml_corner_case

 def test_xmlns_large_document_xml_corner_case(self):
     """Regression test for a Tidy buffer-sizing edge case.

     At a very specific document size Tidy can report a too-small
     required buffer; when that happens the output gets truncated.
     """
     # Nested spans plus exactly 7937 'A's hit the problematic size.
     markup = '<html xmlns="http://www.w3.org/1999/xhtml">' \
              + '<span><span>A</span></span>' + 'A' * 7937
     tidied, _ = tidy_document(markup, {'output-xml': 1})
     # A truncated buffer would lose the closing </html>.
     self.assertEqual(tidied.strip()[-7:], "</html>")
开发者ID:GertBurger,项目名称:pytidylib,代码行数:7,代码来源:test_docs.py


示例7: nofoutofplacefeatures

def nofoutofplacefeatures(url):
	"""Fetch *url* and return the size of HTML Tidy's error report for it.

	Args:
		url: target address; "http://" is prepended when no scheme is given.

	Returns:
		len(errors) from tidy_document, or None (as before) when the fetch
		or parse fails for any reason (best-effort, matching the original).

	NOTE(review): pytidylib returns `errors` as a newline-separated
	string, so len(errors) counts characters, not messages — the author
	probably wanted errors.count('\n'). Preserved for compatibility.
	"""
	try:
		# Default to http:// when the URL carries no scheme.
		if url[:4] != "http":
			url = "http://" + url
		r = requests.get(url)

		document, errors = tidy_document(r.text,
		  options={'numeric-entities':1})

		return len(errors)
	except Exception:
		# Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
		# are no longer swallowed; still returns None on failure.
		return None


示例8: convert_to_html

def convert_to_html(filename):
    """Convert *filename* to tidy HTML written next to the source file.

    pandoc performs the format conversion, HTML Tidy normalizes the
    markup, smart quotes become HTML entities, and the result is saved
    as <input-stem>.html. Tidy's error report is printed to stdout.
    """
    # pandoc does the heavy lifting of format conversion.
    html = pypandoc.convert(filename, 'html')

    # Normalize the markup with HTML Tidy.
    html, errors = tidy_document(html, options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)

    # Replace unicode smart quotes with their HTML entities.
    for smart, entity in ((u"\u2018", '&lsquo;'), (u"\u2019", '&rsquo;'),
                          (u"\u201c", "&ldquo;"), (u"\u201d", "&rdquo;")):
        html = html.replace(smart, entity)

    # Write the output alongside the input, swapping the extension.
    stem, ext = os.path.splitext(filename)
    out_path = "{0}.html".format(stem)
    with open(out_path, 'w') as f:
        # Python 2 "fix": encode unicode output before writing.
        if type(html) is not str:
            html = html.encode('utf-8')
        f.write(html)

    print("Done! Output written to: {}\n".format(out_path))


示例9: html2enml

def html2enml(html):
    """Convert an HTML fragment into ENML-compatible markup.

    The input is tidied into XHTML, parsed with lxml, reduced to the
    document <body> (renamed to <div>, as ENML requires), and stripped
    of elements/attributes the ENML DTD prohibits.

    Returns:
        Serialized <div> element, or the literal "<div></div>" when the
        tidied document has no <body>.
    """
    # doc, err = tidy_fragment(

    # Normalize to XHTML first so lxml can parse it reliably.
    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1
        }
    )

    root = fromstring(doc)

    # XXX dirty hack to circumvent a bug in lxml parser
    root = fromstring(etree.tostring(root))

    logging.debug(etree.tostring(root))

    # tidy_document returns a valid html document which means it usually contains html tag and proper body element
    root = root.find('body')
    if root is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    # ENML notes carry a top-level <div> instead of <body>.
    root.tag = 'div'

    # Drop markup the ENML DTD does not allow.
    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    #FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):

    return etree.tostring(root)
开发者ID:shurain,项目名称:archiver,代码行数:34,代码来源:enml.py


示例10: scrape

def scrape(slug, url, name, title=None):
    """Fetch *url*, tidy it, localize its images and upsert a QuickPage.

    Args:
        slug: page slug; its first path segment names the image folder.
        url: source page to scrape.
        name: page name for the QuickPage record.
        title: optional page title; defaults to *name*.

    NOTE(review): Python 2 code (print statements). Depends on module
    globals `trace` and `dbteeth` and helpers `getimage`,
    `no_namespaces`, `create_or_update`.
    """
    f = urlopen(url)
    doc = f.read()

    # Clean the fetched markup before parsing.
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    # The page content lives in a single td#content cell.
    td = jQuery("td#content")
    assert len(td) == 1

    # Download each image and rewrite its src to the local copy.
    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        # Upsert the QuickPage record keyed by slug.
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )


示例11: __trading_years

 def __trading_years(self, instrument):
     """Yield the trading years listed on Sina's market-history page
     for *instrument* (one string per <option> in the year selector).
     """
     # Sina's HTML is messy; run it through tidy before souping it.
     response = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
     document, _ = tidy_document(response.read())
     soup = BeautifulSoup(document)
     # The year <select> holds one <option> per available trading year.
     year_select = soup.find('select', attrs={'name':'year'})
     for option in year_select.findAll('option'):
         yield option.getText()
开发者ID:B-Rich,项目名称:dealer,代码行数:7,代码来源:sinads.py


示例12: process_response

    def process_response(self, request, response):
        """Django middleware hook: validate HTML responses with tidy.

        Raises HTMLValidationError when tidy reports any problem in a
        non-empty text/html body; otherwise returns the response
        unchanged.
        """
        is_html = 'text/html' in response['Content-Type']
        if is_html and response.content:
            _, errors = tidy_document(response.content)
            if errors:
                raise HTMLValidationError(errors)

        return response
开发者ID:nosamanuel,项目名称:django-test-validator,代码行数:7,代码来源:middleware.py


示例13: _tidysrc

    def _tidysrc(self,data,srccode):
        """Best-effort tidy of HTML source text.

        Args:
            data: HTML source (str or unicode — Python 2 code).
            srccode: name of the encoding the source is expected to use.

        Returns:
            The tidied document, or *data* unchanged if anything fails.

        NOTE(review): BASE_OPTIONS is built below but never passed to
        tidy_document — only {'numeric-entities': 1} is actually used.
        Looks like a bug (or dead code); confirm intent before fixing.
        """

        try:
            from tidylib import tidy_document
            BASE_OPTIONS = {
    "output-xhtml": 1,     # XHTML instead of HTML4
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
    "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    "char-encoding":'utf-8',
    "input-encoding":srccode,
    "output-encoding":'utf-8',
    }
            # Decode byte strings to unicode first; ignore decode failures
            # and let tidy try the raw input.
            if not isinstance(data, unicode):                
                try:
                    data = data.decode(srccode)
                except:
                    pass
            doc, errors = tidy_document(data,options={'numeric-entities':1})
            return doc
        except:
            # Best-effort: on any failure hand back the input untouched.
            return data


示例14: dynamic_test_method

 def dynamic_test_method(self):
     """Template test body; it is renamed dynamically at collection
     time, so this function's own name doesn't matter much.

     Fetches one report page and asserts that tidylib finds no HTML
     errors in it (warnings are suppressed).
     """
     reportURLstring = '/report?reportname=' + reportItem.metadata['action']
     response = self._my_app.get(reportURLstring)
     # tidy_document returns (document, errors); only errors matter here.
     _, problems = tidylib.tidy_document(response.body,
                                         options={'show-errors':1, 'show-warnings':0})
     self.assertFalse(problems, '%s did not return valid html page' % reportURLstring)
开发者ID:woodenshoe,项目名称:infoshopkeeper,代码行数:7,代码来源:test_unittest.py


示例15: getMenu

def getMenu():
    """For each store listed in list.txt, fetch its menu page, tidy it,
    and save the result under dataDir as <store-id>.html.

    list.txt holds 3-line records separated by blank-line pairs; the
    third line is the store's URL path. NOTE(review): Python 2 code
    (print statements); relies on module globals `baseUrl` and
    `dataDir`. The local name `list` shadows the builtin.
    """
    storeFile = open("list.txt","r")
    txt = storeFile.read()
    storeFile.close()
    
    # Records are separated by two blank lines.
    list=txt.split('\n\n\n')
    

 #   print list
    
    for store in list:    
#        print store
        rest = store.split('\n')
        # Stop at the first malformed (non 3-line) record.
        if len(rest)!=3:
            break
        try:
            url=baseUrl+rest[2] +'menu'
            print url
            res=urlopen(url)
            html=res.read()    
         
            options = {'output-encoding':'utf8', 'output-xhtml':1 }
            document,errors = tidy_document(html,options)   
            
            # File name: third path segment of the store URL.
            filepath = dataDir+ (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath,"w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except :
            # Best-effort: skip stores whose fetch/tidy/save fails.
            print "skip:"+url


示例16: get_employees

def get_employees(lastname, firstname):
    """Search the RWTH campus lecturer directory for *lastname*.

    Returns a list of full names (possibly empty) on HTTP 200, or None
    (implicitly) on any other status code. The search endpoint either
    redirects to a single lecturer page (lecturer.asp) or returns a
    result list (lecturerlist.asp); both layouts are handled.

    NOTE(review): Python 2 code (print statement, urlparse module).
    *firstname* is only used in the warning message.
    """
    payload = { 'find' : lastname }
    res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
    if res.status_code == 200:
        persons = [ ]
        
        # Tidy to XHTML so ElementTree can parse it; strip_ns removes
        # the XHTML namespace so plain paths work below.
        document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
        tree = ET.fromstring(strip_ns(document))
        
        try:
            # Which page did we land on after the search?
            filename = posixpath.basename(urlparse.urlsplit(res.url).path)
            if filename == 'lecturer.asp':
                # Single-hit redirect: one lecturer detail page.
                fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
                unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
            
                persons.append(fullname)

            elif filename == 'lecturerlist.asp':
                # Result list: one name per link cell.
                links = [ ]
                for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
                    if cell is not None:
                        fullname = cell.text.strip()
                        persons.append(fullname)
            else:
                raise Exception
        except:
            # Best-effort: scraping failures just produce a warning.
            print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
        
        return persons
开发者ID:BenediktAllendorf,项目名称:snippets,代码行数:29,代码来源:sciebo_shares.py


示例17: sanitize

def sanitize(note):
	"""Sanitize a note's content and title in place.

	When the applytemplate setting is on, the content is wrapped in the
	configured template, selected elements are swapped out for UUID
	placeholders so the regex/tidy passes cannot mangle them, prohibited
	attributes/elements are stripped, the result is tidied, and the
	preserved elements are restored. The title is flattened to a single
	line, or set to the configured default when missing.

	NOTE(review): mutates *note* rather than returning it; relies on
	the project-level `get_setting`, `transform` and `debug` helpers.
	"""
	debug('Sanitizing note content...', 2)

	if get_setting('evernote/sanitize/@applytemplate') == 'True':
		# Wrap the content in the configured template.
		with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
			template = file.read()
			template = template.replace('{content}', note['content'])
			
		note['content'] = transform(template)
		
		# Replace configured elements with unique placeholders so the
		# cleanup passes below cannot touch them.
		preservedElements = []
		preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
		preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
		for preserve in preserves:
			matches = re.findall(preservePattern.format(preserve), note['content'])
			for match in matches:
				placeholder = '{%s}' % uuid.uuid4().hex
				preservedElements.append({'placeholder': placeholder, 'element': match})
				note['content'] = note['content'].replace(match, placeholder, 1)
	
		# Strip empty/prohibited attributes and elements, then tidy.
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
		note['content'] = note['content'].encode('utf-8', errors='ignore')
		(note['content'], errors) = tidy_document(note['content'])

		# Restore the preserved elements in place of their placeholders.
		for element in preservedElements:
			note['content'] = note['content'].replace(element['placeholder'], element['element'])
	
	# Flatten the title to one line; fall back to the configured default.
	if note['title'] != None:
		note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
	else:
		note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')
开发者ID:arychj,项目名称:Notilitus,代码行数:33,代码来源:notilitus.py


示例18: cleanUpHTML

def cleanUpHTML(html, options=None):
    """Clean *html* into strict XHTML via BeautifulSoup + HTML Tidy.

    Args:
        html: markup to clean.
        options: optional dict of tidylib options that override/extend
            the defaults below.

    Returns:
        (prettified_html, errors): the cleaned markup (with a UTF-8
        content-type meta tag added) and tidy's error report.
    """
    import tidylib
    # Clear pytidylib's built-in defaults so only our options apply.
    tidylib.BASE_OPTIONS = {}

    default_options = { 
                        "force-output" : 1,
                        "output-xhtml" : 1,
                        "doctype" : "strict",
                        "drop-empty-paras": 1,
                        "output-encoding" : "utf8",
                        "clean": 1,
                        "bare": 1
                       }
    if options:
        # BUG FIX: dicts have no .extend() — the original raised
        # AttributeError whenever caller options were supplied. Merge
        # caller options over the defaults with update() instead.
        default_options.update(options)

    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup) #html)
    stripEmptyParagraphs(soup)
    
    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)
    
    # Re-parse to inject an explicit UTF-8 content-type meta tag.
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])
    
    return soup.prettify(encoding=None), errors


示例19: html_clean

    def html_clean(self, html):
        """Tidy *html* and reduce it to plain text.

        Returns the extracted text, or None when either tidy reports
        errors or the HTMLCleaner fails to parse the tidied markup.
        """
        # Pass 1: normalize the markup with tidy, body-only XHTML.
        tidy_opts = {
            'drop-proprietary-attributes': 1,
            'alt-text': '',
            'hide-comments': 1,
            'output-xhtml': 1,
            'show-body-only': 1,
            'clean': 1,
            'char-encoding': 'utf8',
            'show-warnings': 0,
            'show-info': 0,
        }
        (html, errors) = tidylib.tidy_document(html, options=tidy_opts)
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        # Pass 2: strip the markup down to text; any parser failure
        # means we must give up.
        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception as e:
            return None
开发者ID:mhagander,项目名称:pgarchivesweb,代码行数:27,代码来源:parser.py


示例20: test_doc_with_unclosed_tag

 def test_doc_with_unclosed_tag(self):
     """Tidy should close a dangling <p> and reflow its content."""
     fragment = "<p>hello"
     # Tidy closes the tag and indents the text onto its own line.
     expected = DOC % "<p>\n   hello\n </p>"
     tidied, _ = tidy_document(fragment)
     self.assertEqual(tidied, expected)
开发者ID:18600597055,项目名称:hue,代码行数:7,代码来源:DocsTest.py



注:本文中的tidylib.tidy_document函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python tidylib.tidy_fragment函数代码示例发布时间:2022-05-27
下一篇:
Python tidy.parseString函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2023 极客世界.|Sitemap