本文整理汇总了Java中net.htmlparser.jericho.OutputDocument类的典型用法代码示例。如果您正苦于以下问题:Java OutputDocument类的具体用法?Java OutputDocument怎么用?Java OutputDocument使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
OutputDocument类属于net.htmlparser.jericho包,在下文中一共展示了OutputDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: processPage
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Loads a page, runs {@link #modifyDocument} over it and returns the resulting markup.
 *
 * @param baseDir  origin used to obtain the reader for the page and passed on to the transform
 * @param pagePath path of the page to read
 * @return the transformed document as a string
 * @throws IOException declared for failures while obtaining or reading the page stream
 */
protected String processPage( PathOrigin baseDir, String pagePath ) throws IOException {
  final long startedAt = System.currentTimeMillis();
  InputStream pageStream = null;
  try {
    pageStream = baseDir.getReader( getRepo() ).getFileInputStream( pagePath );
    final Source parsed = new Source( pageStream );
    final OutputDocument result = new OutputDocument( parsed );
    // apply the attribute rewrites onto the output document
    modifyDocument( parsed, baseDir, result );
    return result.toString();
  } finally {
    // the stream may still be null here if opening it failed
    IOUtils.closeQuietly( pageStream );
    if ( log.isDebugEnabled() ) {
      final long elapsed = System.currentTimeMillis() - startedAt;
      log.debug( String.format( "processPage for %s took %dms", pagePath, elapsed ) );
    }
  }
}
开发者ID:webdetails,项目名称:cte,代码行数:20,代码来源:ProcessedHtmlPage.java
示例2: replaceUrlAttribute
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Rewrites one attribute on each of the given start tags when its value qualifies for processing.
 *
 * @param tags          start tags to inspect
 * @param pathAttribute name of the attribute holding the path (e.g. {@code href} or {@code src})
 * @param baseDir       origin handed to {@code processPath} when computing the new value
 * @param doc           output document in which the attribute replacement is registered
 * @return the number of attributes that were replaced
 */
protected int replaceUrlAttribute ( Iterable<StartTag> tags, final String pathAttribute, PathOrigin baseDir, OutputDocument doc ) {
  int count = 0;
  for ( StartTag tag : tags ) {
    Attributes attr = tag.parseAttributes();
    if ( attr == null ) {
      // NOTE(review): parseAttributes() is documented to return null when the attributes
      // cannot be parsed; there is nothing to rewrite on such a tag, so skip it.
      continue;
    }
    String path = attr.getValue( pathAttribute );
    if ( shouldProcessPath( path ) ) {
      String newPath = processPath( baseDir, path, getUrlProvider() );
      if ( log.isTraceEnabled() ) {
        // Format is <tag>@<attribute> "old" --> "new"; the previous literal contained an
        // invalid conversion and threw as soon as trace logging was enabled.
        log.trace( String.format( "replaced: in %s@%s \"%s\" --> \"%s\"", tag.getName(), pathAttribute, path, newPath ) );
      }
      // replace(attr, true) yields an editable name->value map for this tag's attributes
      doc.replace( attr, true ).put( pathAttribute, newPath );
      count++;
    }
  }
  return count;
}
开发者ID:webdetails,项目名称:cte,代码行数:17,代码来源:ProcessedHtmlPage.java
示例3: strip
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Removes unwanted tags.
 * <p>
 * Every tag accepted by {@code processTag} is flagged with {@code VALID_MARKER};
 * every other tag is removed from the output. Text between tags is not re-encoded
 * by this method.
 *
 * @param html markup to clean; may be {@code null}
 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
 */
public String strip(String html) {
    if (html == null) {
        return "";
    }
    final Source parsed = new Source(html);
    parsed.fullSequentialParse();
    final OutputDocument result = new OutputDocument(parsed);
    for (final Tag current : parsed.getAllTags()) {
        if (!processTag(current, result)) {
            // rejected: drop the tag itself from the output
            result.remove(current);
            continue;
        }
        current.setUserData(VALID_MARKER);
    }
    return result.toString();
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:28,代码来源:HtmlStripperDiscussion.java
示例4: strip
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Removes unwanted tags and re-encodes the text found between tags.
 * <p>
 * Every tag accepted by {@code processTag} is flagged with {@code VALID_MARKER};
 * every other tag is removed from the output. The text preceding each tag, and the
 * text after the last tag, is passed through {@code reencodeTextSegment}.
 *
 * @param html markup to clean; may be {@code null}
 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
 */
public String strip(String html) {
    if (html == null) {
        return "";
    }
    final Source parsed = new Source(html);
    parsed.fullSequentialParse();
    final OutputDocument result = new OutputDocument(parsed);
    // start offset of the text run that has not been re-encoded yet
    int textStart = 0;
    for (final Tag current : parsed.getAllTags()) {
        final boolean accepted = processTag(current, result);
        if (accepted) {
            current.setUserData(VALID_MARKER);
        } else {
            result.remove(current);
        }
        reencodeTextSegment(parsed, result, textStart, current.getBegin());
        textStart = current.getEnd();
    }
    // trailing text after the final tag
    reencodeTextSegment(parsed, result, textStart, parsed.getEnd());
    return result.toString();
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:27,代码来源:HtmlStripper.java
示例5: printHTMLPage
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Writes the content of the document's {@code <body>} element to {@code this.out} as UTF-8,
 * after passing every start tag that follows {@code <body>} through {@code replaceAttrs}.
 * <p>
 * The text of all {@code <style>} elements is collected first and handed to
 * {@code Style.getStyles}; the result is what {@code replaceAttrs} receives.
 * Nothing is written when the document has no {@code <body>} start tag.
 *
 * @param source parsed document to print
 * @throws UnsupportedEncodingException declared by the UTF-8 writer construction
 * @throws IOException if writing to the underlying stream fails
 */
public void printHTMLPage(Source source)
        throws UnsupportedEncodingException, IOException {
    // getContent() spans start-tag end to end-tag begin and, unlike getEndTag(),
    // stays usable when the end tag is missing.
    StringBuilder css = new StringBuilder();
    for (StartTag styleTag : source.getAllStartTags(HTMLElementName.STYLE)) {
        css.append(styleTag.getElement().getContent().toString());
    }
    Vector<ReplaceRight> rights = Style.getStyles(css.toString(), styles, counter);

    // generate the page with the updated styles
    Iterator<StartTag> iterator = source.getAllStartTags().iterator();
    StartTag body = null;
    while (iterator.hasNext()) {
        final StartTag candidate = iterator.next();
        if (HTMLElementName.BODY.equals(candidate.getName())) {
            body = candidate;
            break;
        }
    }
    if (body == null) {
        return;
    }

    final OutputDocument document = new OutputDocument(source);
    // The same iterator is continued on purpose: only tags after <body> are rewritten.
    while (iterator.hasNext()) {
        replaceAttrs(iterator.next(), document, rights);
    }

    OutputStreamWriter writer = new OutputStreamWriter(this.out, "UTF-8");
    final Segment bodyContent = body.getElement().getContent();
    document.writeTo(writer, bodyContent.getBegin(), bodyContent.getEnd());
    writer.flush();
}
开发者ID:Vitaliy-Yakovchuk,项目名称:ramus,代码行数:41,代码来源:Out.java
示例6: modifyDocument
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Rewrites the URL attributes of the document's {@code link}, {@code script} and
 * {@code img} start tags by delegating to {@code replaceUrlAttribute}.
 *
 * @param html    the parsed document
 * @param baseDir location of the html, used when resolving paths
 * @param out     output document that receives the replacements
 */
protected void modifyDocument( Source html, PathOrigin baseDir, OutputDocument out ) {
  final Iterable<StartTag> linkTags = html.getAllStartTags( HTMLElementName.LINK );
  replaceUrlAttribute( linkTags, "href", baseDir, out );

  final Iterable<StartTag> scriptTags = html.getAllStartTags( HTMLElementName.SCRIPT );
  replaceUrlAttribute( scriptTags, "src", baseDir, out );

  final Iterable<StartTag> imageTags = html.getAllStartTags( HTMLElementName.IMG );
  replaceUrlAttribute( imageTags, "src", baseDir, out );
}
开发者ID:webdetails,项目名称:cte,代码行数:14,代码来源:ProcessedHtmlPage.java
示例7: sanitise
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Sanitises a fragment of pseudo-HTML.
 * <p>
 * Tags accepted by {@code processTag} are flagged with {@code VALID_MARKER}. Rejected tags
 * are removed only when {@code stripInvalidElements} is set; otherwise they are left as they
 * appear in the source. A rejected {@code <style>} start tag is removed together with
 * everything up to the next tag, so its CSS text does not leak into the output.
 *
 * @param pseudoHTML           markup to sanitise
 * @param formatWhiteSpace     not read by this method (text re-encoding is not performed here)
 * @param stripInvalidElements whether rejected tags are removed from the output
 * @return the sanitised markup
 */
private static String sanitise(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements) {
    Source source=new Source(pseudoHTML);
    source.fullSequentialParse();
    OutputDocument outputDocument=new OutputDocument(source);
    for (Tag tag : source.getAllTags()) {
        if (processTag(tag,outputDocument)) {
            tag.setUserData(VALID_MARKER);
            continue;
        }
        if (!stripInvalidElements) continue; // rejected tag stays in the output untouched
        // Only the start tag owns the CSS text that follows it. Applying the same range to
        // </style> would also delete the ordinary text after the element.
        boolean styleStartTag=tag.getTagType()==StartTagType.NORMAL && tag.getName().equalsIgnoreCase("style");
        if (styleStartTag) {
            Tag nextTag=tag.getNextTag();
            // the end offset is exclusive, so stop exactly where the next tag begins
            int endPos=(nextTag!=null) ? nextTag.getBegin() : source.getEnd();
            outputDocument.remove(tag.getBegin(),endPos);
        } else {
            outputDocument.remove(tag);
        }
    }
    return outputDocument.toString();
}
开发者ID:trackplus,项目名称:Genji,代码行数:32,代码来源:HTMLSanitiser.java
示例8: processTag
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decides whether a tag is kept and, if so, registers its regenerated form in the output.
 * <p>
 * A tag is rejected when its name is not in {@code VALID_ELEMENT_NAMES}, when it is neither a
 * normal start tag nor a normal end tag, when the matching table/list validity check fails,
 * when a start tag lacks a required end tag, or when an end tag has no element.
 * For elements whose end tag is optional and absent, an end tag is inserted at the element's end.
 *
 * @param tag            tag to examine
 * @param outputDocument output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private static boolean processTag(Tag tag, OutputDocument outputDocument) {
    // NOTE(review): names are compared by identity against HTMLElementName constants below;
    // this assumes getName() hands back those constants -- confirm against the library docs.
    final String name=tag.getName();
    if (!VALID_ELEMENT_NAMES.contains(name)) {
        return false;
    }
    if (tag.getTagType()==StartTagType.NORMAL) {
        final Element owner=tag.getElement();
        if ((name==HTMLElementName.THEAD || name==HTMLElementName.TBODY) && !isValidTbodyTHeadTag(tag)) {
            return false;
        }
        if (name==HTMLElementName.TR && !isValidTRTag(tag)) {
            return false;
        }
        if ((name==HTMLElementName.TD || name==HTMLElementName.TH) && !isValidTDTHTag(tag)) {
            return false;
        }
        if (HTMLElements.getEndTagRequiredElementNames().contains(name)) {
            // a start tag whose mandatory end tag is missing is rejected
            if (owner.getEndTag()==null) {
                return false;
            }
        } else if (HTMLElements.getEndTagOptionalElementNames().contains(name)) {
            if (name==HTMLElementName.LI && !isValidLITag(tag)) {
                return false;
            }
            // supply the optional end tag when the source omitted it
            if (owner.getEndTag()==null) {
                outputDocument.insert(owner.getEnd(),getEndTagHTML(name));
            }
        }
        outputDocument.replace(tag,getStartTagHTML(owner.getStartTag()));
        return true;
    }
    if (tag.getTagType()==EndTagType.NORMAL) {
        // an end tag without an associated element is rejected
        if (tag.getElement()==null) {
            return false;
        }
        if ((name==HTMLElementName.THEAD || name==HTMLElementName.TBODY) && !isValidTbodyTHeadTag(tag)) {
            return false;
        }
        if (name==HTMLElementName.TR && !isValidTRTag(tag)) {
            return false;
        }
        if ((name==HTMLElementName.TD || name==HTMLElementName.TH) && !isValidTDTHTag(tag)) {
            return false;
        }
        if (name==HTMLElementName.LI && !isValidLITag(tag)) {
            return false;
        }
        outputDocument.replace(tag,getEndTagHTML(name));
        return true;
    }
    // any other tag type is rejected
    return false;
}
开发者ID:trackplus,项目名称:Genji,代码行数:35,代码来源:HTMLSanitiser.java
示例9: reencodeTextSegment
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decodes the character references in a span of source text and registers the re-encoded
 * text as its replacement in the output document. Empty or inverted ranges are ignored.
 *
 * @param source           document the span belongs to
 * @param outputDocument   output document receiving the replacement
 * @param begin            start offset of the span
 * @param end              end offset of the span
 * @param formatWhiteSpace selects {@code encodeWithWhiteSpaceFormatting} instead of {@code encode}
 */
private static void reencodeTextSegment(Source source, OutputDocument outputDocument, int begin, int end, boolean formatWhiteSpace) {
    if (begin<end) {
        final Segment span=new Segment(source,begin,end);
        final String plain=CharacterReference.decode(span);
        final String reencoded;
        if (formatWhiteSpace) {
            reencoded=CharacterReference.encodeWithWhiteSpaceFormatting(plain);
        } else {
            reencoded=CharacterReference.encode(plain);
        }
        outputDocument.replace(span,reencoded);
    }
}
开发者ID:trackplus,项目名称:Genji,代码行数:8,代码来源:HTMLSanitiser.java
示例10: processTag
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decides whether a tag is kept and, if so, registers its regenerated form in the output.
 * <p>
 * A tag is rejected when its lower-cased name is not in {@code allowedTags}, when it is neither
 * a normal start tag nor a normal end tag, when a start tag lacks a required end tag, when an
 * end tag has no element, or when an {@code li} tag fails {@code isValidLITag}.
 * For elements whose end tag is optional and absent, an end tag is inserted at the element's end.
 *
 * @param tag    tag to examine
 * @param output output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private boolean processTag(Tag tag, OutputDocument output) {
    String elementName = tag.getName().toLowerCase();
    if (!allowedTags.contains(elementName)) {
        return false;
    }
    // Compare by value: toLowerCase() is not guaranteed to return the same String instance,
    // so an identity check against the constant could silently skip the LI validation.
    boolean isListItem = HTMLElementName.LI.equals(elementName);
    if (tag.getTagType() == StartTagType.NORMAL) {
        Element element = tag.getElement();
        if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
            if (element.getEndTag() == null) {
                return false; // reject start tag if its required end tag is missing
            }
        } else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
            if (isListItem && !isValidLITag(tag)) {
                return false; // reject invalid LI tags
            }
            if (element.getEndTag() == null) {
                // insert optional end tag if it is missing
                output.insert(element.getEnd(), getEndTagHTML(elementName));
            }
        }
        output.replace(tag, getStartTagHTML(element.getStartTag()));
    } else if (tag.getTagType() == EndTagType.NORMAL) {
        if (tag.getElement() == null) {
            return false; // reject end tags that aren't associated with a start tag
        }
        if (isListItem && !isValidLITag(tag)) {
            return false; // reject invalid LI tags
        }
        output.replace(tag, getEndTagHTML(elementName));
    } else {
        return false; // reject abnormal tags
    }
    return true;
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:39,代码来源:HtmlStripperDiscussion.java
示例11: reencodeTextSegment
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decodes the character references in a span of source text and registers the re-encoded
 * text as its replacement in the output document. Empty or inverted ranges are ignored.
 *
 * @param source document the span belongs to
 * @param output output document receiving the replacement
 * @param begin  start offset of the span
 * @param end    end offset of the span
 */
private void reencodeTextSegment(Source source, OutputDocument output, int begin, int end) {
    if (begin < end) {
        final Segment span = new Segment(source, begin, end);
        final String plain = CharacterReference.decode(span);
        output.replace(span, CharacterReference.encode(plain));
    }
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:9,代码来源:HtmlStripper.java
示例12: processTag
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Decides whether a tag is kept and, if so, registers its regenerated form in the output.
 * <p>
 * A tag is rejected when its lower-cased name is not in {@code allowedTags}, when it is neither
 * a normal start tag nor a normal end tag, when a start tag lacks a required end tag, when an
 * end tag has no element, or when an {@code li} tag fails {@code isValidLITag}.
 * For elements whose end tag is optional and absent, an end tag is inserted at the element's end.
 *
 * @param tag    tag to examine
 * @param output output document receiving replacements and insertions
 * @return {@code true} if the tag was accepted, {@code false} if it was rejected
 */
private boolean processTag(Tag tag, OutputDocument output) {
    String elementName = tag.getName().toLowerCase();
    if (!allowedTags.contains(elementName)) {
        return false;
    }
    if (tag.getTagType() == StartTagType.NORMAL) {
        Element element = tag.getElement();
        boolean endTagMissing = element.getEndTag() == null;
        if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
            if (endTagMissing) {
                return false; // reject start tag if its required end tag is missing
            }
        } else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
            // equals() rather than ==: the name went through toLowerCase(), which does not
            // promise to return the interned constant instance.
            if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) {
                return false; // reject invalid LI tags
            }
            if (endTagMissing) {
                // insert optional end tag if it is missing
                output.insert(element.getEnd(), getEndTagHTML(elementName));
            }
        }
        output.replace(tag, getStartTagHTML(element.getStartTag()));
    } else if (tag.getTagType() == EndTagType.NORMAL) {
        if (tag.getElement() == null) {
            return false; // reject end tags that aren't associated with a start tag
        }
        if (HTMLElementName.LI.equals(elementName) && !isValidLITag(tag)) {
            return false; // reject invalid LI tags
        }
        output.replace(tag, getEndTagHTML(elementName));
    } else {
        return false; // reject abnormal tags
    }
    return true;
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:31,代码来源:HtmlStripper.java
示例13: reencodeTextSegment
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Replaces a span of source text in the output with a decoded-then-encoded copy of itself.
 * Nothing happens when {@code begin} is not strictly before {@code end}.
 *
 * @param source document the span belongs to
 * @param output output document receiving the replacement
 * @param begin  start offset of the span
 * @param end    end offset of the span
 */
private void reencodeTextSegment(Source source, OutputDocument output,
        int begin, int end) {
    final boolean hasText = begin < end;
    if (!hasText) {
        return;
    }
    final Segment span = new Segment(source, begin, end);
    final String reencoded = CharacterReference.encode(CharacterReference.decode(span));
    output.replace(span, reencoded);
}
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:10,代码来源:HtmlStripper.java
示例14: realWriteWithHTMLUpdate
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Re-parses the buffered output as HTML, injects the collected styles and writes the result
 * to {@code outputStream} as UTF-8.
 * <p>
 * When the buffer contains no {@code <html>} start tag the method falls back to
 * {@code realWrite()}. Otherwise the styles are placed, in order of preference:
 * at the end of the first {@code <style>} element; in a new {@code <style>} element at the end
 * of the first {@code <head>} element; or in a new {@code <head>} element right after the
 * {@code <html>} start tag.
 *
 * @throws IOException if writing to the target stream fails
 */
public void realWriteWithHTMLUpdate() throws IOException {
    flush();
    ByteArrayOutputStream out = (ByteArrayOutputStream) this.out;
    Source source = new Source(new String(out.toByteArray(), "UTF-8"));
    source.fullSequentialParse();
    List<StartTag> htmlTags = source.getAllStartTags("html");
    if (htmlTags.isEmpty()) {
        realWrite();
        return;
    }
    this.out = outputStream;
    OutputStreamWriter writer = new OutputStreamWriter(this.out, "UTF-8");
    OutputDocument document = new OutputDocument(source);

    StringBuilder css = new StringBuilder();
    for (Style style : this.styles) {
        css.append(style.toString());
    }

    // getContent().getEnd() equals the end tag's begin when the end tag exists and stays
    // usable when it is missing, where getEndTag() would be null.
    List<StartTag> styleTags = source.getAllStartTags("style");
    if (!styleTags.isEmpty()) {
        document.insert(styleTags.get(0).getElement().getContent().getEnd(), css);
    } else {
        css.insert(0, "\n<style>\n");
        css.append("</style>\n");
        List<StartTag> headTags = source.getAllStartTags("head");
        if (!headTags.isEmpty()) {
            document.insert(headTags.get(0).getElement().getContent().getEnd(), css);
        } else {
            // No <head> at all: create one directly after the <html> start tag. The previous
            // code indexed into the empty head list here and always threw.
            css.insert(0, "\n<head>\n");
            css.append("</head>\n");
            document.insert(htmlTags.get(0).getEnd(), css);
        }
    }
    document.writeTo(writer);
    writer.flush();
}
开发者ID:Vitaliy-Yakovchuk,项目名称:ramus,代码行数:46,代码来源:Out.java
示例15: removeNotAllowedTags
import net.htmlparser.jericho.OutputDocument; //导入依赖的package包/类
/**
 * Cleans an HTML fragment: strips attributes from every element except anchors, makes anchor
 * {@code href} values absolute against {@code docUri}, removes the tags listed in
 * {@code NOT_ALLOWED_HTML_TAGS} (including the content of {@code script}, {@code style} and
 * {@code form} elements), and finally deletes all newline and tab characters.
 *
 * @param htmlFragment markup to clean
 * @param docUri       URI of the containing document, used to resolve relative links
 * @return the cleaned markup without newlines or tabs
 */
private String removeNotAllowedTags(String htmlFragment, URI docUri) {
    Source source = new Source(htmlFragment);
    OutputDocument outputDocument = new OutputDocument(source);
    for (Element element : source.getAllElements()) {
        String name = element.getName();
        Attributes attrs = element.getAttributes();
        // NOTE(review): getAttributes() is documented to return null for start-tag types that
        // carry no attributes (comments, for instance); such elements have nothing to rewrite.
        if (attrs != null) {
            Map<String, String> attrsUpdate = outputDocument.replace(attrs, true);
            // Exact match on the anchor name; a substring test would also exempt
            // "span", "table", "area" and every other name containing the letter 'a'.
            if (!"a".equals(name)) {
                attrsUpdate.clear();
            } else {
                String link = attrsUpdate.get("href");
                if (link != null && !link.contains("http")) {
                    try {
                        URI resolved = docUri.resolve(new URI(link));
                        attrsUpdate.put("href", resolved.toString());
                    } catch (URISyntaxException e) {
                        // an anchor whose link cannot be parsed is dropped entirely
                        outputDocument.remove(element);
                    }
                }
            }
        }
        if (NOT_ALLOWED_HTML_TAGS.contains(name)) {
            if ("script".equals(name) || "style".equals(name) || "form".equals(name)) {
                outputDocument.remove(element.getContent());
            }
            outputDocument.remove(element.getStartTag());
            // The end tag is absent for empty-element tags and for elements such as <br>
            // or an unclosed <p>; only remove it when it actually exists.
            if (element.getEndTag() != null) {
                outputDocument.remove(element.getEndTag());
            }
        }
    }
    String out = outputDocument.toString();
    out = out.replace("\n", "");
    out = out.replace("\t", "");
    return out;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:58,代码来源:HtmlArticleExtractor.java
注:本文中的net.htmlparser.jericho.OutputDocument类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论