本文整理汇总了Java中de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser类的典型用法代码示例。如果您正苦于以下问题:Java MediaWikiParser类的具体用法?Java MediaWikiParser怎么用?Java MediaWikiParser使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
MediaWikiParser类属于de.tudarmstadt.ukp.wikipedia.parser.mediawiki包,在下文中一共展示了MediaWikiParser类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: opens a Wikipedia database, parses the page
 * "House_(disambiguation)" and prints each nested list found on it,
 * numbered from 1.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared for the Wikipedia API calls made here
 */
public static void main(String[] args) throws WikiApiException {
    // Database connection settings (the literal values look like placeholders).
    DatabaseConfiguration configuration = new DatabaseConfiguration();
    configuration.setDatabase("DATABASE");
    configuration.setHost("HOST");
    configuration.setUser("USER");
    configuration.setPassword("PASSWORD");
    configuration.setLanguage(Language.english);

    // Open the wiki and build a parser for English.
    Wikipedia wikipedia = new Wikipedia(configuration);
    MediaWikiParserFactory factory = new MediaWikiParserFactory(Language.english);
    MediaWikiParser wikiParser = factory.createParser();

    // Parse the markup of the page 'House_(disambiguation)'.
    ParsedPage parsedPage = wikiParser.parse(wikipedia.getPage("House_(disambiguation)").getText());

    // Print all nested lists of the page, each preceded by its running number.
    int counter = 0;
    for (NestedList nestedList : parsedPage.getNestedLists()) {
        counter++;
        System.out.println(counter + ": \n" + outputNestedList(nestedList, 0));
    }
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:27,代码来源:T6_NestedLists.java
示例2: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: parses the sample document and prints every link as
 * {@code <left context><UPPER-CASED LINK TEXT><right context>}.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String sampleText = TestFile.getFileText();

    // Build a parser with the default factory settings and parse the sample.
    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    MediaWikiParser wikiParser = factory.createParser();
    ParsedPage page = wikiParser.parse(sampleText);

    for (Link current : page.getLinks()) {
        // Per the original note: 1 token to the left and 2 tokens to the right of the link.
        String before = current.getContext(1, 0);
        String anchor = current.getText().toString().toUpperCase();
        String after = current.getContext(0, 2);
        System.out.println(before + "<" + anchor + ">" + after);
    }
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:20,代码来源:T3_LinkContexts.java
示例3: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: parses the sample document and prints, for every
 * section, its title and the number of paragraphs, tables, nested lists
 * and definition lists it contains.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared by the signature
 */
public static void main(String[] args) throws IOException {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String sampleText = TestFile.getFileText();

    // Parse it with a default-configured parser.
    MediaWikiParser wikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage page = wikiParser.parse(sampleText);

    // One statistics line group per section.
    for (Section current : page.getSections()) {
        System.out.println("section : " + current.getTitle());
        System.out.println(" nr of paragraphs : " + current.nrOfParagraphs());
        System.out.println(" nr of tables : " + current.nrOfTables());
        System.out.println(" nr of nested lists : " + current.nrOfNestedLists());
        System.out.println(" nr of definition lists: " + current.nrOfDefinitionLists());
    }
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:24,代码来源:T1_SimpleParserDemo.java
示例4: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: parses the sample document, prints the targets of its
 * language links, then prints the targets of the internal links of each
 * section.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared by the signature
 */
public static void main(String[] args) throws WikiApiException {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String sampleText = TestFile.getFileText();

    // Parse it with a default-configured parser.
    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    ParsedPage page = factory.createParser().parse(sampleText);

    // Links to other Wikipedia language editions.
    for (Link languageLink : page.getLanguages()) {
        System.out.println(languageLink.getTarget());
    }

    // Internal links, grouped by the section that contains them.
    for (Section current : page.getSections()) {
        System.out.println("Section: " + current.getTitle());
        for (Link internal : current.getLinks(Link.type.INTERNAL)) {
            System.out.println(" " + internal.getTarget());
        }
    }
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:30,代码来源:T2_InternalLinks.java
示例5: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: parses the sample document with "Image" registered as
 * an image identifier and writes the HTML rendering of the parsed page to
 * {@code htmlFileDemo.html}.
 *
 * @param argv command line arguments (not used)
 * @throws Exception declared by the signature
 */
public static void main(String[] argv) throws Exception {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String sampleText = TestFile.getFileText();

    // Individually parametrized parser: register "Image" as an image identifier.
    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    factory.getImageIdentifers().add("Image");
    MediaWikiParser wikiParser = factory.createParser();

    ParsedPage page = wikiParser.parse(sampleText);

    // Render to HTML and write the file, then report where it went.
    String targetFile = "htmlFileDemo.html";
    HtmlWriter.writeFile(targetFile, "UTF8", HtmlWriter.parsedPageToHtml(page));
    System.out.println("Writing output to file: " + targetFile);
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:18,代码来源:HtmlFileDemo.java
示例6: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: prints each link of the sample document as its
 * upper-cased text in angle brackets, framed by its left and right context.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String input = TestFile.getFileText();

    // Obtain a ParsedPage from a default-configured parser.
    MediaWikiParser mediaWikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage parsed = mediaWikiParser.parse(input);

    for (Link link : parsed.getLinks()) {
        // Per the original note: 1 token left of the link, 2 tokens right of it.
        StringBuilder line = new StringBuilder();
        line.append(link.getContext(1, 0)).append("<");
        line.append(link.getText().toString().toUpperCase()).append(">");
        line.append(link.getContext(0, 2));
        System.out.println(line.toString());
    }
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:20,代码来源:T3_LinkContexts.java
示例7: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: prints per-section statistics (paragraphs, tables,
 * nested lists, definition lists) for the parsed sample document.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared by the signature
 */
public static void main(String[] args) throws IOException {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String input = TestFile.getFileText();

    // Obtain a ParsedPage from a default-configured parser.
    MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();
    MediaWikiParser mediaWikiParser = parserFactory.createParser();
    ParsedPage parsed = mediaWikiParser.parse(input);

    // Report the element counts of every section.
    for (Section sec : parsed.getSections()) {
        System.out.println("section : " + sec.getTitle());
        System.out.println(" nr of paragraphs : " + sec.nrOfParagraphs());
        System.out.println(" nr of tables : " + sec.nrOfTables());
        System.out.println(" nr of nested lists : " + sec.nrOfNestedLists());
        System.out.println(" nr of definition lists: " + sec.nrOfDefinitionLists());
    }
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:25,代码来源:T1_SimpleParserDemo.java
示例8: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: for the parsed sample document, prints the targets of
 * the language links followed by the internal link targets of every section.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared by the signature
 */
public static void main(String[] args) throws WikiApiException {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String input = TestFile.getFileText();

    // Obtain a ParsedPage from a default-configured parser.
    MediaWikiParser mediaWikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage parsed = mediaWikiParser.parse(input);

    // Only the links to other Wikipedia language editions.
    for (Link lang : parsed.getLanguages()) {
        System.out.println(lang.getTarget());
    }

    // Internal links, section by section.
    for (Section sec : parsed.getSections()) {
        System.out.println("Section: " + sec.getTitle());
        for (Link target : sec.getLinks(Link.type.INTERNAL)) {
            System.out.println(" " + target.getTarget());
        }
    }
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:30,代码来源:T2_InternalLinks.java
示例9: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: converts the parsed sample document to HTML and stores
 * it in {@code htmlFileDemo.html}.
 *
 * @param argv command line arguments (not used)
 * @throws Exception declared by the signature
 */
public static void main(String[] argv) throws Exception {
    // Sample document (the contents are equal to "DarmstadtWikipediaArticle.txt").
    String input = TestFile.getFileText();

    // Customised factory: "Image" is added to the image identifiers before the parser is created.
    MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();
    parserFactory.getImageIdentifers().add("Image");
    ParsedPage parsed = parserFactory.createParser().parse(input);

    // Write the HTML rendering and tell the user which file was used.
    String htmlFile = "htmlFileDemo.html";
    HtmlWriter.writeFile(htmlFile, "UTF8", HtmlWriter.parsedPageToHtml(parsed));
    System.out.println("Writing output to file: " + htmlFile);
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:18,代码来源:HtmlFileDemo.java
示例10: getSectionsWithJWPL
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Parses a Talk page with the JWPL MediaWiki parser and converts its sections.
 *
 * <p>The parser is created with source-span calculation enabled. Each parsed
 * section becomes one {@code ExtractedSection} built from the section title and
 * paragraphs, to which the section's nested lists are added.
 *
 * @param text the talk page text with markup
 * @return the extracted sections, in the order the parser reports them
 */
public static List<ExtractedSection> getSectionsWithJWPL(String text) {
    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    factory.setCalculateSrcSpans(true);
    ParsedPage page = factory.createParser().parse(text);

    List<ExtractedSection> result = new ArrayList<>();
    for (Section current : page.getSections()) {
        ExtractedSection extracted =
                new ExtractedSection(current.getTitle(), current.getParagraphs());
        extracted.addNestedLists(current.getNestedLists());
        result.add(extracted);
    }
    return result;
}
开发者ID:DiscourseDB,项目名称:discoursedb-core,代码行数:20,代码来源:WikitextParseUtils.java
示例11: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: fetches the page "Dog" from a Wikipedia database,
 * parses its text and prints the title of every section.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared for the Wikipedia API calls made here
 */
public static void main(String[] args) throws WikiApiException {
    // Database connection settings (the literal values look like placeholders).
    DatabaseConfiguration configuration = new DatabaseConfiguration();
    configuration.setDatabase("DATABASE");
    configuration.setHost("HOST");
    configuration.setUser("USER");
    configuration.setPassword("PASSWORD");
    configuration.setLanguage(Language.english);

    // Open the wiki and fetch the page 'Dog'.
    Wikipedia wikipedia = new Wikipedia(configuration);
    Page dogPage = wikipedia.getPage("Dog");

    // Parse the page text with a default-configured parser.
    MediaWikiParser wikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage parsedPage = wikiParser.parse(dogPage.getText());

    // Print the section titles.
    for (Section current : parsedPage.getSections()) {
        System.out.println(current.getTitle());
    }
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:28,代码来源:T4_InterfacingWithWikipedia.java
示例12: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: fetches the page "Dog" and prints its text as produced
 * by a parser configured with {@code FlushTemplates} as template parser and
 * "Image" as image identifier.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared for the Wikipedia API calls made here
 */
public static void main(String[] args) throws WikiApiException {
    // Database connection settings (the literal values look like placeholders).
    DatabaseConfiguration configuration = new DatabaseConfiguration();
    configuration.setDatabase("DATABASE");
    configuration.setHost("HOST");
    configuration.setUser("USER");
    configuration.setPassword("PASSWORD");
    configuration.setLanguage(Language.english);

    // Open the wiki and fetch the page 'Dog'.
    Wikipedia wikipedia = new Wikipedia(configuration);
    Page dogPage = wikipedia.getPage("Dog");

    MediaWikiParserFactory factory = new MediaWikiParserFactory();
    // Per the original note, this template parser filters TEMPLATE elements.
    factory.setTemplateParserClass(FlushTemplates.class);
    // Image identifier used for filtering image elements. Replace it with the
    // image template name of your Wiki language edition ("Image" in English).
    String imageIdentifier = "Image";
    factory.getImageIdentifers().add(imageIdentifier);

    // Parse the page text and print the resulting plain text.
    ParsedPage parsedPage = factory.createParser().parse(dogPage.getText());
    System.out.println(parsedPage.getText());
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:33,代码来源:T5_CleaningTemplateImage.java
示例13: testParsedPage
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Parses the page "Wikipedia API" and checks the first two links of its
 * first section as well as the plain text of the whole parsed page.
 */
@Test
public void testParsedPage() {
    final String title = "Wikipedia API";
    Page page = null;
    try {
        page = wiki.getPage(title);
    } catch (WikiApiException e) {
        e.printStackTrace();
        fail("A WikiApiException occured while getting the page " + title);
    }

    // Expected plain text: two lines joined by a line feed.
    final String lineFeed = "\n";
    String expectedText = "Wikipedia API ist die wichtigste Software überhaupt." + lineFeed
            + "Wikipedia API. Nicht zu übertreffen. Unglaublich http://www.ukp.tu-darmstadt.de en:Wikipedia API";

    MediaWikiParser wikiParser = new MediaWikiParserFactory(Language.english).createParser();
    ParsedPage parsed = wikiParser.parse(page.getText());

    // NOTE(review): links beyond the second are ignored, and if the section has
    // fewer than two links the corresponding assertions are silently skipped.
    int position = 0;
    for (Link link : parsed.getSection(0).getLinks()) {
        switch (position) {
            case 0:
                assertEquals("Software", link.getText());
                break;
            case 1:
                assertEquals("Wikipedia API", link.getText());
                assertEquals("JWPL", link.getTarget());
                break;
            default:
                break;
        }
        position++;
    }

    assertEquals(expectedText, parsed.getText());
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:36,代码来源:ParsedPageTest.java
示例14: parseInternalLinks
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Collects the targets of all internal links in the given wiki markup.
 *
 * @param text wiki markup to parse
 * @return the targets of the links whose type is {@code Link.type.INTERNAL},
 *         in the order the parser reports them; empty when the parser
 *         returns {@code null}
 */
public static List<String> parseInternalLinks(String text) {
    MediaWikiParser wikiParser = new MediaWikiParserFactory().createParser();
    ParsedPage page = wikiParser.parse(text);

    List<String> targets = new LinkedList<>();
    if (page == null) {
        // Nothing was parsed: report no links.
        return targets;
    }
    for (Link candidate : page.getLinks()) {
        if (candidate.getType() != Link.type.INTERNAL) {
            continue;
        }
        targets.add(candidate.getTarget());
    }
    return targets;
}
开发者ID:iamxiatian,项目名称:wikit,代码行数:16,代码来源:WikiTextParser.java
示例15: main
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Demo entry point: prints the text of the page "Dog" after parsing it with
 * an English parser that uses {@code FlushTemplates} as template parser and
 * "Image" as image identifier.
 *
 * @param args command line arguments (not used)
 * @throws WikiApiException declared for the Wikipedia API calls made here
 */
public static void main(String[] args) throws WikiApiException {
    // Database connection settings (the literal values look like placeholders).
    DatabaseConfiguration settings = new DatabaseConfiguration();
    settings.setDatabase("DATABASE");
    settings.setHost("HOST");
    settings.setUser("USER");
    settings.setPassword("PASSWORD");
    settings.setLanguage(Language.english);

    // Initialize the wiki and load the page 'Dog'.
    Wikipedia wikipediaDb = new Wikipedia(settings);
    Page source = wikipediaDb.getPage("Dog");

    // Configure the parser factory.
    MediaWikiParserFactory parserFactory = new MediaWikiParserFactory(Language.english);
    // Per the original note, this template parser filters TEMPLATE elements.
    parserFactory.setTemplateParserClass(FlushTemplates.class);
    // Used for filtering image elements. Replace the value with the image template
    // name of your Wiki language edition, e.g. "Image" in English.
    String imageName = "Image";
    parserFactory.getImageIdentifers().add(imageName);

    // Parse the page text and print the result.
    MediaWikiParser mediaWikiParser = parserFactory.createParser();
    ParsedPage parsed = mediaWikiParser.parse(source.getText());
    System.out.println(parsed.getText());
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:33,代码来源:T5_CleaningTemplateImage.java
示例16: LinkAnchorExtractor
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Creates an extractor that uses the supplied parser.
 *
 * @param parser the MediaWiki parser stored in this instance
 */
public LinkAnchorExtractor(MediaWikiParser parser) {
    this.parser = parser;
}
开发者ID:dkpro,项目名称:dkpro-jwpl,代码行数:4,代码来源:LinkAnchorExtractor.java
示例17: drillMoreInfo
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Derives additional information from this page's wiki text.
 *
 * <p>Resets {@code internalLinks} and {@code categories}, then:
 * <ul>
 *   <li>returns immediately if the page is neither an article nor a category;</li>
 *   <li>for a category page, fills the category-related fields
 *       ({@code commonCategory}, {@code categories}, {@code redirect},
 *       {@code commonsCatTag}) and returns;</li>
 *   <li>for an article, parses the categories, builds {@code plainText} from
 *       the section titles and paragraphs, and records the page links.</li>
 * </ul>
 */
public void drillMoreInfo() {
    internalLinks = new ArrayList<>();
    categories = new HashSet<>();
    if (!isArticle() && !isCategory()) {
        return;
    }

    // Category pages: only category information is extracted.
    if (isCategory()) {
        commonCategory = true;
        // The root category is kept as a normal category without further parsing.
        if (!conf.getWikiRootCategoryName().equalsIgnoreCase(getCategoryTitle())) {
            // Parse the categories this category belongs to.
            categories = WikiTextParser.parseCategories(text);
            if (CollectionUtils.isEmpty(categories)) {
                commonCategory = false;
                // Decide whether this is a category redirect.
                this.redirect = WikiTextParser.parseCategoryRedirect(text);
                if (!isRedirect()) {
                    this.commonsCatTag = WikiTextParser.parseCommonsCat(text);
                }
            }
        }
        return;
    }

    // Article pages.
    categories = WikiTextParser.parseCategories(text);
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(text);
    if (pp == null) {
        plainText = "";
        System.out.println("text parse error: id==>" + id + ", title==>" + title + ", ns==>" + ns + ", content==>" + text);
        return;
    }

    // Accumulate in a StringBuilder: repeated String += inside the nested loops
    // copies the whole text on every append.
    StringBuilder plain = new StringBuilder();
    for (Section s : pp.getSections()) {
        if (s.getTitle() != null) {
            plain.append(s.getTitle()).append("\n");
        }
        for (Paragraph p : s.getParagraphs()) {
            String par = p.getText();
            // Skip paragraphs that start with a TEMPLATE marker.
            if (par.startsWith("TEMPLATE")) {
                continue;
            }
            // Skip paragraphs consisting of a single "prefix:value" token without spaces.
            if (par.matches("[^:]+:[^\\ ]+")) {
                continue;
            }
            plain.append(par).append("\n\n");
        }
    }
    plainText = plain.toString();

    // NOTE(review): pageLinks is appended to but, unlike internalLinks, is not
    // reset in this method -- confirm that is intended.
    for (Link link : pp.getLinks()) {
        if (link.getType() == Link.type.INTERNAL) {
            internalLinks.add(link.getTarget());
        }
        pageLinks.add(link);
    }
}
开发者ID:iamxiatian,项目名称:wikit,代码行数:64,代码来源:WikiPage.java
示例18: getPlainText
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Returns the plain text of this page, computing and caching it on first use.
 *
 * <p>The text is built from the section titles (each followed by a line
 * feed) and the paragraphs (each followed by an empty line). Paragraphs that
 * start with {@code TEMPLATE}, or that consist of a single
 * {@code prefix:value} token without spaces, are skipped; embedded
 * {@code TEMPLATE[...]} markers are removed from the remaining paragraphs.
 * If the parser returns {@code null} the plain text is the empty string.
 *
 * @return the cached plain text, never {@code null} after this call
 */
public String getPlainText() {
    if (plainText != null) {
        return plainText;
    }

    // Compile both patterns once instead of once per paragraph.
    Pattern singleTokenWithColon = Pattern.compile("[^:]+:[^\\ ]+");
    Pattern embeddedTemplate =
            Pattern.compile("TEMPLATE\\[[^\\]]+\\]", Pattern.CASE_INSENSITIVE);

    StringBuilder sb = new StringBuilder();
    MediaWikiParserFactory pf = new MediaWikiParserFactory();
    MediaWikiParser parser = pf.createParser();
    ParsedPage pp = parser.parse(text);
    if (pp != null) {
        for (Section s : pp.getSections()) {
            if (s.getTitle() != null) {
                sb.append(s.getTitle()).append("\n");
            }
            for (Paragraph p : s.getParagraphs()) {
                String par = p.getText();
                if (par.startsWith("TEMPLATE")) {
                    continue;
                }
                if (singleTokenWithColon.matcher(par).matches()) {
                    continue;
                }
                // Strip TEMPLATE[...] markers embedded in the paragraph text.
                sb.append(embeddedTemplate.matcher(par).replaceAll(""));
                sb.append("\n\n");
            }
        }
    }
    plainText = sb.toString();
    return plainText;
}
开发者ID:iamxiatian,项目名称:wikit,代码行数:45,代码来源:WikiPage.java
示例19: getDocumentFromText
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Reads wiki markup from a file and turns it into a {@code Document_Lara}.
 *
 * <p>The text is pre-processed with {@code preParseWikiText}, parsed with a
 * French parser that uses {@code FrenchTemplateParser}, and every section is
 * handed to {@code runDependencyRecursive}. The chunk list (starting with a
 * root chunk named "mediawiki") is then numbered consecutively, attached to
 * the document, and the document is passed through
 * {@code ShiftReduce_Service.assign_shiftreduce}.
 *
 * @param path path of the file containing the wiki text
 * @return the document returned by the shift-reduce service
 * @throws UnsupportedEncodingException declared by the signature
 * @throws FileNotFoundException declared by the signature
 */
public Document_Lara getDocumentFromText(String path)
        throws UnsupportedEncodingException, FileNotFoundException {
    // Load the raw markup.
    IO_Service reader = new IO_Service();
    String rawText = reader.readFile(path);

    // Parser factory for French, with the French template parser plugged in.
    MediaWikiParserFactory factory = new MediaWikiParserFactory(Language.french);
    factory.setTemplateParserClass(FrenchTemplateParser.class);
    MediaWikiParser wikiParser = factory.createParser();

    String cleanedText = preParseWikiText(rawText);
    ParsedPage page = wikiParser.parse(cleanedText);

    Document_Lara document = new Document_Lara();
    document.setName("mediawiki");

    // Start a fresh chunk list whose first element is the root chunk.
    currListChunk = new ArrayList<Chunk_Lara>();
    Chunk_Lara root = new Chunk_Lara(0, 0, 0, 0);
    root.setType("root");
    root.setDepRel("");
    root.setDepId(-1);
    root.setText("mediawiki");
    currListChunk.add(root);

    for (Content sectionContent : page.getSections()) {
        runDependencyRecursive(0, sectionContent);
    }

    // Number the chunks by their position in the list.
    for (int position = 0; position < currListChunk.size(); position++) {
        currListChunk.get(position).setId(position);
    }
    document.setChunk(currListChunk);

    ShiftReduce_Service shiftReduce = new ShiftReduce_Service();
    return shiftReduce.assign_shiftreduce(document, 0);
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:47,代码来源:Wikipedia_Service.java
示例20: runDependencyBase
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; //导入依赖的package包/类
/**
 * Builds a {@code Document_Lara} from a Wikipedia page.
 *
 * <p>The page text is pre-processed with {@code preParseWikiText} and parsed
 * with a French parser that uses {@code FrenchTemplateParser}. A root chunk
 * carrying the page title is placed first in the chunk list, each section is
 * processed by {@code runDependencyRecursive}, the chunks are numbered by
 * list position, and the document is finally passed through
 * {@code ShiftReduce_Service.assign_shiftreduce}.
 *
 * @param page the page to convert
 * @return the document returned by the shift-reduce service
 * @throws WikiTitleParsingException declared by the signature
 */
public static Document_Lara runDependencyBase(Page page)
        throws WikiTitleParsingException {
    // French parser with the French template parser plugged in.
    MediaWikiParserFactory frenchFactory = new MediaWikiParserFactory(Language.french);
    frenchFactory.setTemplateParserClass(FrenchTemplateParser.class);
    MediaWikiParser frenchParser = frenchFactory.createParser();

    String prepared = preParseWikiText(page.getText());
    ParsedPage parsed = frenchParser.parse(prepared);

    Document_Lara result = new Document_Lara();
    result.setName(page.getTitle() + "");

    // Fresh chunk list headed by a root chunk named after the page title.
    currListChunk = new ArrayList<Chunk_Lara>();
    Chunk_Lara rootChunk = new Chunk_Lara(0, 0, 0, 0);
    rootChunk.setType("root");
    rootChunk.setDepRel("");
    rootChunk.setDepId(-1);
    rootChunk.setText(page.getTitle() + "");
    currListChunk.add(rootChunk);

    for (Content section : parsed.getSections()) {
        runDependencyRecursive(0, section);
    }

    // Assign consecutive ids in list order.
    int nextId = 0;
    for (Chunk_Lara chunk : currListChunk) {
        chunk.setId(nextId++);
    }
    result.setChunk(currListChunk);

    return new ShiftReduce_Service().assign_shiftreduce(result, 0);
}
开发者ID:fauconnier,项目名称:LaToe,代码行数:44,代码来源:Wikipedia_Service.java
注:本文中的de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论