• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python tidylib.tidy_fragment函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Python 中 tidylib.tidy_fragment 函数的典型用法代码示例。如果您正苦于以下问题:Python tidy_fragment 函数的具体用法?Python tidy_fragment 怎么用?Python tidy_fragment 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了tidy_fragment函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_frag_with_entity

 def test_frag_with_entity(self):
     """Check how a named character entity is emitted by ``tidy_fragment``.

     The first assertion runs with default options, the second with the
     ``numeric-entities`` option switched on.

     NOTE(review): in the copy this was taken from, the input and both
     expected values were the same bare "é", which makes the second
     assertion meaningless and leaves no entity in a test named
     "with_entity". That looks like HTML entities decoded during page
     extraction; the literals below restore the presumed originals --
     confirm against the upstream FragsTest.py.
     """
     h = "&eacute;"

     # Default options: the named entity is expected to pass through as-is.
     expected = "&eacute;"
     doc, err = tidy_fragment(h)
     self.assertEqual(doc, expected)

     # numeric-entities=1: the named entity is expected in numeric form.
     expected = "&#233;"
     doc, err = tidy_fragment(h, {'numeric-entities': 1})
     self.assertEqual(doc, expected)
开发者ID:18600597055,项目名称:hue,代码行数:9,代码来源:FragsTest.py


示例2: parse_book_file

def parse_book_file(href, book):
    """Enrich *book* with data scraped from the HTML file *href*.

    The file is parsed from ``books_dir``. When *book* has no
    ``'page_count'`` yet, the table cell mentioning the book title is
    searched for a page-count element and the first group of
    ``patterns[0][1]`` applied to that element's markup is stored.
    The siblings following the last "annotation" / "contents" marker table
    (up to the next ``<table>``) are cleaned, serialised, tidied and stored
    under ``book['annotation']`` / ``book['contents']``.

    :param href: file name of the book page, relative to ``books_dir``.
    :param book: dict with at least a ``'title'`` key; updated in place.
    :return: the same *book* dict.
    """
    book_tree = lxml.html.parse(join(books_dir, href), parser)

    if 'page_count' not in book:
        # Pass the title as an XPath variable instead of splicing it into
        # the expression: a title containing an apostrophe would otherwise
        # produce an invalid XPath string literal.
        cells = book_tree.xpath(
            "//td[descendant::*[contains(text(), $title)]]",
            title=book['title'])
        if cells:
            page_info = cells[0].xpath(
                "descendant::*[contains(text(), 'страниц')]")
            if page_info:
                match = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode'))
                # Leave page_count unset rather than crash when the
                # pattern does not match the element's markup.
                if match:
                    book['page_count'] = match.groups()[0]

    block = {}
    block['annotation'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Содержание')]]")

    for key, tables in block.items():
        if not tables:
            continue
        # The section body is whatever follows the last marker table.
        mark = tables[-1]
        parts = []
        for element in mark.itersiblings():
            if element.tag == "table":
                break
            drop_a(element)
            remove_attr(element)
            parts.append(tostring(element, encoding='unicode'))
        book[key] = tidy_fragment(clean("".join(parts)))[0]
    return book
开发者ID:a-iv,项目名称:practica.ru,代码行数:31,代码来源:oldsite_parser.py


示例3: test_frag_with_unclosed_tag

    def test_frag_with_unclosed_tag(self):
        """An unclosed ``<p>`` must come back closed, with its text indented."""
        fragment = "<p>hello"
        wanted = "<p>\n  hello\n</p>"
        tidied, _messages = tidy_fragment(fragment)
        self.assertEqual(tidied, wanted)
开发者ID:waylan,项目名称:pytidylib,代码行数:7,代码来源:FragsTest.py


示例4: sanitize_html

def sanitize_html(value):
    """Reduce HTML to the small tag set listed below and tidy the result.

    Comments are removed, attributes are stripped from every tag, tags
    outside the allowed list are hidden (their contents are kept), ``em``
    and ``strong`` are rewritten to ``i`` and ``b``, the markup is passed
    through ``tidy_fragment`` and a handful of named entities are replaced
    with plain characters.

    :param value: HTML text, or ``None``.
    :return: the sanitized fragment; ``""`` when *value* is ``None``.
    """
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor does new-ish
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = set(tag for tag, _replacement in tags)
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        # Compare by value: an identity check on strings only works by the
        # accident of literal interning.
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    # Named entities swapped for plain-text stand-ins, applied in order.
    entity_replacements = (
        ("&nbsp;", " "),
        ("&rsquo;", "'"),
        ("&lsquo;", "'"),
        ("&quot;", '"'),
        ("&ldquo;", '"'),
        ("&rdquo;", '"'),
        ("&bull;", "- "),
        ("&eacute;", "e"),
        ("&Eacute;", "E"),
        ("&ndash;", "-"),
    )
    for entity, plain in entity_replacements:
        fragment = fragment.replace(entity, plain)
    return fragment
开发者ID:praekelt,项目名称:ummeli,代码行数:60,代码来源:vlive_tags.py


示例5: clean

def clean(html):
    """Sanitize *html* with bleach, then repair it with tidylib.

    Falsy input (empty string, ``None``) is handed back untouched.
    Otherwise the text is filtered against the tag and attribute
    whitelists from ``local_config`` and the tidied fragment is returned.
    """
    if html:
        sanitized = bleach.clean(
            html,
            tags=local_config.TAG_WHITELIST,
            attributes=local_config.ATTRIBUTE_WHITELIST,
        )
        # tidy catches some additional problems; its messages are ignored
        tidied, _messages = tidylib.tidy_fragment(sanitized)
        return tidied
    return html
开发者ID:mrwonko,项目名称:homepage,代码行数:7,代码来源:just_sanitize.py


示例6: link_title_uid_txt

def link_title_uid_txt(i):
    """Pull ``(link, title, rss_uid, txt)`` out of one feed item dict.

    The link comes from the first ``'alternate'`` entry (``''`` if absent),
    the title is unescaped (a placeholder is used if absent) and the uid
    falls back to ``1``. The body is read from ``'summary'`` or
    ``'content'``, tidied and converted to plain text.

    Returns ``None`` when the item has no body or the body converts to
    empty text.
    """
    link = i['alternate'][0]['href'] if 'alternate' in i else ''
    title = unescape(i['title']) if 'title' in i else '无题'
    rss_uid = i.get('id') or 1

    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return

    markup = snippet['content']
    if not markup:
        return

    markup = txttidy(markup)
    markup = txt_map('<pre', '</pre>', markup, pre_br)
    markup = tidy_fragment(markup, {'indent': 0})[0]
    # line breaks become real newlines before the text conversion
    markup = markup.replace('<br />', '\n')
    txt = htm2txt(markup)

    if txt:
        return link, title, rss_uid, txt
开发者ID:immissile,项目名称:42qu_github_mirror,代码行数:31,代码来源:rss_update.py


示例7: get_article_text

    def get_article_text(self, body):
        """
        Extract the article body both as tidied HTML and as plain text.

        :param body: object whose ``find`` locates the ``article-body`` div
        :return: tuple ``(article_body, article_body_no_html)``; when the div
                 is missing these are the tidied empty string and ``None``
        """
        container = body.find("div", {"class": "article-body"})

        if container is None:
            plain_text = None
            markup = ''
        else:
            # Take the plain text first, before the tag contents are zapped.
            plain_text = self.gremlin_zapper.zap_string(container.get_text())
            self.zap_tag_contents(container)
            markup = ''.join(str(child) for child in container.contents)

        markup, errors = tidy_fragment(markup, options={'numeric-entities': 1})

        return markup, plain_text
开发者ID:ucsc,项目名称:slug-news,代码行数:25,代码来源:scraper.py


示例8: test_frag_with_unicode_subclass

    def test_frag_with_unicode_subclass(self):
        """A fragment passed as a ``utype`` subclass comes back equal to itself."""
        class MyUnicode(utype):
            pass

        source = MyUnicode("unicode string ß")
        tidied, _messages = tidy_fragment(source)
        self.assertEqual(tidied, source)
开发者ID:waylan,项目名称:pytidylib,代码行数:8,代码来源:FragsTest.py


示例9: object_for_typepad_object

def object_for_typepad_object(tp_obj):
    """Find or build the ``Object`` record for a TypePad asset.

    On the paths visible here the function returns a two-item tuple whose
    first item is a bool: ``(False, obj)`` when an already stored object is
    reused, and ``(True, obj.in_reply_to)`` when a reblog has no body left
    after its boilerplate has been removed.

    NOTE(review): this excerpt appears to stop inside the ``reblog_of_url``
    branch; whatever follows (saving the object, the final return) is not
    visible here -- confirm against the full source.
    """
    # Reuse an existing record for this asset if there is one.
    try:
        obj = Object.objects.get(service='typepad.com', foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj, tp_obj.url_id)
        return False, obj

    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id, tp_obj.author.display_name)

    author = account_for_typepad_user(tp_obj.author)
    # Prefer the rendered content; otherwise build a body from the raw
    # content, wrapping blank-line separated chunks in <p> when the format
    # asks for linebreak conversion.
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        if tp_obj.text_format == 'html_convert_linebreaks':
            body = '\n\n'.join(u'<p>%s</p>' % t for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        # Only the tidied markup is kept; tidy's messages are discarded.
        body, errors = tidy_fragment(body)
    else:
        body = ''

    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )

    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)

        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        # Nothing left besides the boilerplate: hand back the reblogged
        # object itself, flagged with True.
        if not obj.body:
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        reblog_url = tp_obj.reblog_of_url
        # Both failure modes leave in_reply_to as None; only ValueError is logged.
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s", reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
开发者ID:apparentlymart,项目名称:leapfrog,代码行数:57,代码来源:typepad.py


示例10: tidy_html

def tidy_html(html):
    """
    Process an input string containing HTML and return a tuple (xhtml,
    errors, warnings) containing the output of tidylib and lists of
    validation errors and warnings.

    Control characters matched by ``CONTROL_CHAR_RE`` are replaced with
    spaces first; when any were replaced, a warning listing their code
    points is included in the returned warnings.

    Input must be unicode.
    Output will be valid XHTML.

    Raises ValueError when the input is not a unicode string.
    """
    if not isinstance(html, unicode):
        raise ValueError("tidyhtml must be called with a Unicode string!")

    errors = list()
    warnings = list()

    # First, deal with embedded control codes. Keep the untouched text so
    # the stripped code points can still be found after the substitution.
    original = html
    html, sub_count = CONTROL_CHAR_RE.subn(" ", html)
    if sub_count:
        stripped = sorted(set(ord(i) for i in CONTROL_CHAR_RE.findall(original)))
        warnings.append("Stripped %d control characters from body: %s" % (
            sub_count,
            stripped
        ))

    html, messages = tidylib.tidy_fragment(
        html.strip(),
        {
            "char-encoding":               "utf8",
            "clean":                        False,
            "drop-empty-paras":             False,
            "drop-font-tags":               True,
            "drop-proprietary-attributes":  False,
            "fix-backslash":                True,
            "indent":                       True,
            "output-xhtml":                 True,
        }
    )

    # postprocess warnings to avoid HTML fragments being reported as lacking
    # doctype and title:
    ignored = (
        "Warning: missing <!DOCTYPE> declaration",
        "Warning: inserting missing 'title' element",
        "Warning: inserting implicit <body>",
    )

    for line in messages.split("\n"):
        msg = line.strip()
        if not msg:
            continue
        if any(text in msg for text in ignored):
            continue

        if "Error:" in msg:
            errors.append(msg)
        else:
            # Appended to the existing list so the control-character
            # warning from above is not lost.
            warnings.append(msg)

    return html, errors, warnings
开发者ID:akaihola,项目名称:feincms,代码行数:57,代码来源:tidy.py


示例11: cleanupText

def cleanupText(text):
    """Clean up the text of the report with libtidy and return it as ``str``."""
    tidy_options = {
        "output_xhtml": 1,
        "add_xml_decl": 1,
        "indent": 1,
        "tidy_mark": 0,
        "char_encoding": "utf8",
        "quote_nbsp": 0,
    }
    # unescape HTML entities, then hand tidy the UTF-8 encoded text
    encoded_body = unescape(text).encode("utf8")
    tidied, _messages = tidy_fragment(encoded_body, tidy_options, keep_doc=False)
    # the tidy result is coerced to str before it is returned
    return str(tidied)
开发者ID:BenoitTalbot,项目名称:bungeni-portal,代码行数:10,代码来源:downloaddocument.py


示例12: html

 def html(self, string):
     """Return *string* as HTML, honouring the ``allow_html`` setting.

     Raises ``Exception`` when ``allow_html`` is missing from
     ``INGIniousConfiguration`` or equal to ``False``. When it is
     ``"tidy"`` the string is passed through ``tidylib.tidy_fragment``;
     for any other value the string is returned unchanged.
     """
     html_allowed = (
         "allow_html" in INGIniousConfiguration
         and not (INGIniousConfiguration["allow_html"] == False)
     )
     if not html_allowed:
         raise Exception("HTML is not allowed")

     if INGIniousConfiguration["allow_html"] == "tidy":
         import tidylib
         tidied, _messages = tidylib.tidy_fragment(string)
         return tidied

     return string
开发者ID:GuillaumeDerval,项目名称:INGInious,代码行数:10,代码来源:parsable_text.py


示例13: html2xhtml

def html2xhtml(html, **options):
    """Convert an HTML fragment to XML output using ``tidy_fragment``.

    Extra keyword arguments are passed on as tidy options; ``doctype``,
    ``show_warnings``, ``indent`` and ``output_xml`` are always overridden.
    Raises ``Exception`` when tidy reports anything.
    """
    options.update(doctype='omit', show_warnings=0, indent=0, output_xml=1)
    document, errors = tidy_fragment(html, options=options)
    if not errors:
        return document
    raise Exception(
        "Errors while processing %s\n==========\n%s" % (html, errors))
开发者ID:MaxTyutyunnikov,项目名称:lino,代码行数:10,代码来源:html2xhtml.py


示例14: fix_open_tags

def fix_open_tags(source):
    """ Repair missing tags in an html fragment.

        Falsy input is returned unchanged. Tidy messages are only logged
        (after filtering) when ``settings.DEBUG`` is on.
    """
    if source:
        fixedhtml, messages = tidy_fragment(source)
        if settings.DEBUG and messages:
            remaining = filter_tidylib_errors(messages)
            if remaining:
                log.debug('Tidylib errors:\n{}'.format(remaining))
        return fixedhtml
    return source
开发者ID:welbornprod,项目名称:wp_site,代码行数:11,代码来源:htmltools.py


示例15: POST

        def POST(self):
            """ POST request

            Decodes the posted JSON, looks up the task named in the grader
            payload, runs a grading job and returns a JSON string with the
            keys ``correct``, ``score`` and ``msg``. Every failure is
            reported through the same JSON shape with ``correct`` set to
            ``None`` and ``score`` set to ``0``.
            """
            web.header('Content-Type', 'application/json')

            def grader_error(reason):
                # All failure paths share one payload shape; only the reason differs.
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: {}</p>".format(reason)})

            post_input = web.data()

            # `except Exception` (not a bare except) so that SystemExit and
            # KeyboardInterrupt are not turned into grader errors.
            try:
                decoded_input = json.loads(post_input)
            except Exception:
                return grader_error("cannot decode POST")

            if "xqueue_body" not in decoded_input:
                return grader_error("no xqueue_body in POST")
            try:
                edx_input = json.loads(decoded_input["xqueue_body"])
                taskid = json.loads(edx_input["grader_payload"])["tid"]
            except Exception:
                return grader_error("cannot decode JSON")

            try:
                task = course.get_task(taskid)
            except Exception:
                return grader_error("unknown task {}".format(taskid))

            if not task.input_is_consistent(edx_input):
                return grader_error("input not consistent with task")

            try:
                job_return = job_manager_sync.new_job(task, edx_input, "Plugin - EDX")
            except Exception:
                return grader_error("error while grading submission")

            try:
                text = ""
                if "text" in job_return:
                    text = job_return["text"]
                if "problems" in job_return:
                    # Append one titled section per problem to the feedback.
                    for prob in job_return["problems"]:
                        text += "<br/><h4>" + job_return["task"].get_problems()[prob].get_name() + "</h4>" + job_return["problems"][prob]

                # An explicit score in the job result overrides the 1/0 default.
                score = (1 if job_return["result"] == "success" else 0)
                if "score" in job_return:
                    score = job_return["score"]

                import tidylib

                out, dummy = tidylib.tidy_fragment(text, options={'output-xhtml': 1, 'enclose-block-text': 1, 'enclose-text': 1})
                return json.dumps({"correct": (True if (job_return["result"] == "success") else None), "score": score, "msg": out})
            except Exception:
                return grader_error("error converting submission result")
开发者ID:jonsan21,项目名称:INGInious,代码行数:50,代码来源:edx.py


示例16: normalize

def normalize(text):
    """ Run a string of html through tidylib to normalize its whitespace. """
    tidy_options = dict(
        drop_empty_paras=0,
        fix_backslash=0,
        fix_bad_comments=0,
        fix_uri=0,
        join_styles=0,
        lower_literals=0,
        merge_divs=0,
        output_xhtml=1,
        quote_ampersand=0,
        newline='LF',
    )
    # only the tidied markup is of interest; tidy's messages are dropped
    tidied, _messages = tidylib.tidy_fragment(text, options=tidy_options)
    return tidied
开发者ID:manikanta-kumar-allakki,项目名称:polls-india,代码行数:14,代码来源:__init__.py


示例17: html2xhtml

 def html2xhtml(html, **options):
     """Convert an HTML fragment to XHTML using ``tidy_fragment``.

     Extra keyword arguments are passed on as tidy options; ``doctype``,
     ``show_warnings``, ``indent`` and ``output_xhtml`` are always
     overridden. Returns the stripped document and raises ``Exception``
     when tidy reports anything.
     """
     options.update(doctype='omit', show_warnings=0, indent=0, output_xhtml=1)
     document, errors = tidy_fragment(html, options=options)
     if not errors:
         return document.strip()
     raise Exception(
         "Errors while processing %s\n==========\n%s" % (html, errors))
开发者ID:zhuangyan,项目名称:lino,代码行数:15,代码来源:html2xhtml.py


示例18: __init__

    def __init__(self, op_html):
        """
        Intializes this option with HTML. The HTML is validated before initializing the option.
        The input HTML should be a snippet and not contain the `html`, `head`, `title`, nor `body` tags.
        Throws an HTMLValidationException if the validation produces errors.

        :param op_html: The string representation of the option HTML.
        :return:
        """

        document, errors = tidy_fragment("<!DOCTYPE html><html><head><title></title><body>%s</body></html>" % op_html)
        # python is stupid
        if len(errors) > 1:
            print errors
            raise HTMLValidationException()
        else:
            Option.__init__(self, op_html)
开发者ID:LegoStormtroopr,项目名称:SMPy,代码行数:17,代码来源:options.py


示例19: mytidy

def mytidy(content):
    """Tidy *content* as a fragment with a fixed option set; return the markup."""
    tidy_options = {
        # output format
        "output-xhtml": 0,
        "doctype": 'strict',
        "tidy-mark": 0,          # no tidy meta tag in the output
        "force-output": 1,       # produce output even when tidy finds errors
        "alt-text": "",
        # layout
        "indent": 1,
        "indent-spaces": 4,
        "tab-size": 4,
        "wrap": 0,               # no line wrapping
        # encodings
        "char-encoding": 'utf8',
        "input-encoding": 'utf8',
        "output-encoding": 'utf8',
    }
    tidied = tidy_fragment(content, tidy_options)
    return tidied[0]
开发者ID:bdrydyk,项目名称:wurdig,代码行数:17,代码来源:tidy_helper.py


示例20: normalize

def normalize(text):
    """ Run a string of html through tidylib to normalize its whitespace. """
    tidy_options = dict([
        ("drop_empty_paras", 0),
        ("fix_backslash", 0),
        ("fix_bad_comments", 0),
        ("fix_uri", 0),
        ("join_styles", 0),
        ("lower_literals", 0),
        ("merge_divs", 0),
        ("output_xhtml", 1),
        ("quote_ampersand", 0),
        ("newline", "LF"),
    ])
    # tidy's messages are not used, only the markup is returned
    tidied, _messages = tidylib.tidy_fragment(text, options=tidy_options)
    return tidied
开发者ID:Grassflying2,项目名称:Python-Markdown,代码行数:18,代码来源:__init__.py



注:本文中的 tidylib.tidy_fragment 函数示例由纯净天空整理自 GitHub/MSDocs 等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python tifffile.imread函数代码示例发布时间:2022-05-27
下一篇:
Python tidylib.tidy_document函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap