本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument类的典型用法代码示例。如果您正苦于以下问题:Java NutchDocument类的具体用法?Java NutchDocument怎么用?Java NutchDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
NutchDocument类属于org.apache.nutch.indexer包,在下文中一共展示了NutchDocument类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: indexerScore
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Scales the incoming score by the boost of every known top-level-domain
 * entry listed in the document's {@code tld} field.
 *
 * @param url        URL of the document (not read by this method)
 * @param doc        document whose {@code tld} field values are inspected
 * @param dbDatum    not read by this method
 * @param fetchDatum not read by this method
 * @param parse      not read by this method
 * @param inlinks    not read by this method
 * @param initScore  score to be scaled
 * @return {@code initScore} multiplied by the product of the boosts of all
 *         {@code tld} values that have an entry in {@code tldEntries};
 *         values without an entry leave the score untouched
 * @throws ScoringFilterException declared by the signature
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {
  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    // No TLD information on the document: score is scaled by the neutral factor.
    return initScore * multiplier;
  }
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue; // unknown suffix, nothing to apply
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TLDScoringFilter.java
示例2: testEmptyIndexStatic
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that running the filter with an empty {@code index.static} leaves a
 * fresh document without any fields.
 *
 * @throws Exception declared by the test signature; exceptions raised by the
 *           filter call itself are caught and reported through
 *           {@code Assert.fail}
 */
@Test
public void testEmptyIndexStatic() throws Exception {
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument document = new NutchDocument();
  try {
    filter.filter(document, parse, url, crawlDatum, inlinks);
  } catch (Exception failure) {
    failure.printStackTrace();
    Assert.fail(failure.getMessage());
  }

  Assert.assertNotNull(document);
  boolean hasNoFields = document.getFieldNames().isEmpty();
  Assert.assertTrue("tests if no field is set for empty index.static",
      hasNoFields);
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:TestStaticFieldIndexerTest.java
示例3: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Copies selected CrawlDatum metadata entries onto the document.
 * <p>
 * For every tag name held in {@code urlMetaTags} (per the original
 * documentation, the tags listed in the "urlmeta.tags" property), the
 * matching entry is looked up in the CrawlDatum metadata and, when present,
 * added to the document under the tag name.
 *
 * @param doc     document to enrich; returned unchanged when {@code null}
 * @param parse   not read by this method
 * @param url     not read by this method
 * @param datum   source of the metadata values
 * @param inlinks not read by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Re-apply the current configuration whenever one is present.
  if (conf != null) {
    this.setConf(conf);
  }

  boolean nothingToCopy = (urlMetaTags == null) || (doc == null);
  if (!nothingToCopy) {
    for (String tagName : urlMetaTags) {
      Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
      if (tagValue == null) {
        continue; // tag not present on this datum
      }
      doc.add(tagName, tagValue.toString());
    }
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:URLMetaIndexingFilter.java
示例4: addTime
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds the {@code date} field, and when available the {@code lastModified}
 * field, to the document.
 * <p>
 * The time is taken from the first usable source, in this order: the
 * {@code Last-Modified} parse metadata, the modified time of the CrawlDatum,
 * the fetch time of the CrawlDatum.
 *
 * @param doc   document to receive the fields
 * @param data  parse data providing the {@code Last-Modified} metadata
 * @param url   URL of the document, passed to {@code getTime}
 * @param datum CrawlDatum providing the fallback times
 * @return the same {@code doc} instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long time = -1; // -1 marks "no usable Last-Modified value"

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {
    time = getTime(lastModified, url);
    // NOTE(review): getTime presumably returns -1 when the header cannot be
    // parsed -- confirm. Skip the field in that case so that a bogus
    // "new Date(-1)" is never indexed as lastModified.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {
    // No usable Last-Modified value: fall back to the CrawlDatum.
    time = datum.getModifiedTime();
    if (time <= 0) {
      // Modified time is unset as well: use the fetch time instead.
      time = datum.getFetchTime();
    }
  }

  doc.add("date", new Date(time));
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:MoreIndexingFilter.java
示例5: testNoParts
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Runs {@code MoreIndexingFilter} with
 * {@code moreIndexingFilter.indexMimeTypeParts} disabled and expects the
 * document to carry exactly one {@code type} value, {@code text/html}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);
  Assert.assertNotNull(indexingFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());
  try {
    indexingFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"), new CrawlDatum(),
        new Inlinks());
  } catch (Exception failure) {
    failure.printStackTrace();
    Assert.fail(failure.getMessage());
  }

  Assert.assertNotNull(document);
  Assert.assertTrue(document.getFieldNames().contains("type"));
  Assert.assertEquals(1, document.getField("type").getValues().size());
  Assert.assertEquals("text/html", document.getFieldValue("type"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:26,代码来源:TestMoreIndexingFilter.java
示例6: testMissingConfigFile
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With the regex-file property absent from the configuration, every parse
 * must pass the filter, i.e. the filter never returns {@code null}.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testMissingConfigFile() throws Exception {
  String configuredFile = conf.get(
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
  String preconditionMessage = String.format(
      "Property %s must not be present in the the configuration file",
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE);
  Assert.assertEquals(preconditionMessage, "", configuredFile);

  filter.setConf(conf);

  // The property is not set, so all documents must pass the filter.
  int index = 0;
  while (index < parses.length) {
    NutchDocument filtered = filter.filter(new NutchDocument(), parses[index],
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNotNull("All documents must be allowed by default", filtered);
    index++;
  }
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:MimeTypeIndexingFilterTest.java
示例7: testAllowOnlyImages
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Configures the filter with {@code allow-images.txt} and expects only the
 * parses whose MIME type contains "image" to yield a document; all others
 * must be filtered out (null result).
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testAllowOnlyImages() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    NutchDocument filtered = filter.filter(new NutchDocument(), parses[idx],
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    boolean isImage = MIME_TYPES[idx].contains("image");
    if (!isImage) {
      Assert.assertNull("Block everything else", filtered);
      continue;
    }
    Assert.assertNotNull("Allow only images", filtered);
  }
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:MimeTypeIndexingFilterTest.java
示例8: testBlockHTML
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Configures the filter with {@code block-html.txt} and expects the parses
 * whose MIME type contains "html" to be filtered out (null result) while all
 * others still yield a document.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testBlockHTML() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    NutchDocument filtered = filter.filter(new NutchDocument(), parses[idx],
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    boolean isHtml = MIME_TYPES[idx].contains("html");
    if (!isHtml) {
      Assert.assertNotNull("Allow everything else", filtered);
      continue;
    }
    Assert.assertNull("Block only HTML documents", filtered);
  }
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:MimeTypeIndexingFilterTest.java
示例9: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds a {@code lang} field to the document.
 * <p>
 * The language is read from the parse metadata ({@code Metadata.LANGUAGE});
 * only when that entry is absent is the content metadata
 * ({@code Response.CONTENT_LANGUAGE}) consulted. A missing or empty result is
 * indexed as {@code "unknown"}.
 *
 * @param doc     document that receives the {@code lang} field
 * @param parse   source of the parse and content metadata
 * @param url     not read by this method
 * @param datum   not read by this method
 * @param inlinks not read by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Language found during parsing, possibly put there by HTMLLanguageParser.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Otherwise see whether the HTTP header tells us the language.
  if (language == null) {
    language = parse.getData().getContentMeta()
        .get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = (language == null) || (language.length() == 0);
  doc.add("lang", undetermined ? "unknown" : language);
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:LanguageIndexingFilter.java
示例10: testDeduplicateAnchor
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Enables {@code anchorIndexingFilter.deduplicate}, feeds three inlinks of
 * which two share the anchor text "text2", and expects exactly two
 * {@code anchor} values on the document.
 *
 * @throws Exception declared by the test signature; exceptions raised by the
 *           filter call itself are caught and reported through
 *           {@code Assert.fail}
 */
@Test
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  Assert.assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    anchorFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"), new CrawlDatum(),
        incoming);
  } catch (Exception failure) {
    failure.printStackTrace();
    Assert.fail(failure.getMessage());
  }

  Assert.assertNotNull(document);
  boolean hasAnchor = document.getFieldNames().contains("anchor");
  Assert.assertTrue("test if there is an anchor at all", hasAnchor);
  Assert.assertEquals("test dedup, we expect 2", 2,
      document.getField("anchor").getValues().size());
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:27,代码来源:TestAnchorIndexingFilter.java
示例11: addUrlFeatures
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds one feature per component of a license URL's path.
 * <p>
 * URLs have the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where each "xx" names a license feature. The path is split at slashes and
 * dashes, the first component ("licenses") is discarded, and every remaining
 * component is passed to {@code addFeature}. A malformed URL is logged at
 * warn level and otherwise ignored.
 *
 * @param doc       document that receives the features
 * @param urlString license URL to decompose
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    String path = new URL(urlString).getPath();
    // Break the path into components at slashes and dashes.
    StringTokenizer components = new StringTokenizer(path, "/-");

    boolean leadingComponentSkipped = false;
    while (components.hasMoreTokens()) {
      String component = components.nextToken();
      if (!leadingComponentSkipped) {
        // The first component is "licenses" and carries no feature.
        leadingComponentSkipped = true;
        continue;
      }
      addFeature(doc, component);
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:27,代码来源:CCIndexingFilter.java
示例12: testFilterOutlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_OUTLINKS_HOST} set to "true", expects exactly one
 * {@code outlinks} value on the document, equal to the target URL of the
 * first generated outlink.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testFilterOutlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  filter.setConf(conf);

  Outlink[] generated = generateOutlinks();
  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument indexed = filter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Assert.assertEquals(1, indexed.getField("outlinks").getValues().size());
  Assert.assertEquals("Filter outlinks, allow only those from a different host",
      generated[0].getToUrl(), indexed.getFieldValue("outlinks"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TestLinksIndexingFilter.java
示例13: testFilterInlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_INLINKS_HOST} set to "true" and two inlinks (one from
 * www.test.com, one from www.example.com, the host of the indexed page),
 * expects only "http://www.test.com" to be indexed as {@code inlinks}.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument indexed = filter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), incoming);

  Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());
  Assert.assertEquals("Filter inlinks, allow only those from a different host",
      "http://www.test.com", indexed.getFieldValue("inlinks"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:TestLinksIndexingFilter.java
示例14: testNoFilterInlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_INLINKS_HOST} set to "false", expects the number of
 * indexed {@code inlinks} values to equal the number of inlinks supplied,
 * including the one from the page's own host.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testNoFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument indexed = filter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), incoming);

  Assert.assertEquals("All inlinks must be indexed even those from the same host",
      incoming.size(), indexed.getField("inlinks").getValues().size());
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TestLinksIndexingFilter.java
示例15: testIndexHostsOnlyAndFilterOutlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to
 * "true", expects a single {@code outlinks} value equal to the host part of
 * "http://www.test.com".
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] generated = generateOutlinks(true);
  filter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument indexed = filter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Assert.assertEquals(1, indexed.getField("outlinks").getValues().size());
  Assert.assertEquals(
      "Index only the host portion of the outlinks after filtering",
      new URL("http://www.test.com").getHost(),
      indexed.getFieldValue("outlinks"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestLinksIndexingFilter.java
示例16: testIndexHostsOnlyAndFilterInlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_INLINKS_HOST} set to
 * "true" and two inlinks (www.test.com and www.example.com), expects a
 * single {@code inlinks} value equal to the host part of
 * "http://www.test.com".
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument indexed = filter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), incoming);

  Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());
  Assert.assertEquals(
      "Index only the host portion of the inlinks after filtering",
      new URL("http://www.test.com").getHost(),
      indexed.getFieldValue("inlinks"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:TestLinksIndexingFilter.java
示例17: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * <p>
 * Runs {@code doReplace} for the "host" key when
 * {@code FIELDREPLACERS_BY_HOST} is non-empty, then for the "url" key when
 * {@code FIELDREPLACERS_BY_URL} is non-empty. A {@code null} document is
 * returned as is.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (doc == null) {
    return doc;
  }

  boolean hasHostReplacers = FIELDREPLACERS_BY_HOST.size() > 0;
  if (hasHostReplacers) {
    this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
  }

  boolean hasUrlReplacers = FIELDREPLACERS_BY_URL.size() > 0;
  if (hasUrlReplacers) {
    this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
  }

  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:ReplaceIndexer.java
示例18: testReplacementsWithFlags
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Exercises a replacement pattern that carries a flags value.
 * <p>
 * The trailing 2 in the replacement spec is Pattern.CASE_INSENSITIVE: the
 * pattern is written in upper case and is expected to match text in any
 * case.
 */
@Test
public void testReplacementsWithFlags() {
  String replaceSpec = " metatag.description=/THIS PLUGIN/this awesome plugin/2";
  String wantedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";

  Configuration configuration = NutchConfiguration.create();
  configuration.set(
      "plugin.includes",
      "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  configuration.set(INDEX_REPLACE_PROPERTY, replaceSpec);
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md",
      "metatag.author,metatag.description,metatag.keywords");
  // Not required, but helpful when stepping through the filter in a debugger.
  configuration.set("http.timeout", "99999999999");

  // Send the sample file through the parser and the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // The case-insensitive replacement must have produced the wanted value.
  Assert.assertEquals(wantedDescription,
      filtered.getFieldValue("metatag.description"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:31,代码来源:TestIndexReplace.java
示例19: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Copies every parse-metadata entry onto the document and keeps the document
 * only if one of the entries is named {@code ogc_service}.
 *
 * @param doc     document that receives one field per parse-metadata name
 * @param parse   source of the parse metadata
 * @param url     URL of the document, used in the log message only
 * @param datum   not read by this method
 * @param inlinks not read by this method
 * @return {@code doc} when an {@code ogc_service} entry was seen, otherwise
 *         {@code null}
 * @throws IndexingException declared by the signature
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  Metadata parseMeta = parse.getData().getParseMeta();

  boolean isOgcService = false;
  for (String name : parseMeta.names()) {
    if (name.equals("ogc_service")) {
      isOgcService = true;
    }
    String fieldValue = parseMeta.get(name);
    // Logged once per metadata entry.
    LOG.info("Adding " + url + " to NutchDocument");
    doc.add(name, fieldValue);
  }

  // Only OGC services are indexed; every other document is dropped.
  if (!isOgcService) {
    return null;
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:OgcIndexingFilter.java
示例20: testOgcIndexingFilter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Parses the {@code testWMS.xml} fixture with {@code OgcParseFilter}, runs
 * the result through {@code OgcIndexingFilter}, and checks that the
 * {@code ogc_version}, {@code ogc_service} and {@code raw_content} fields
 * are present on the returned document.
 *
 * @throws FileNotFoundException if the fixture file cannot be opened
 * @throws URISyntaxException    if the fixture's resource URL is not a valid URI
 * @throws IndexingException     if the indexing filter fails
 */
@Test
public void testOgcIndexingFilter() throws FileNotFoundException, URISyntaxException, IndexingException {
  File f = new File(getClass().getResource("testWMS.xml").toURI());

  // Read the whole file as one token ("\\Z" matches end of input) and make
  // sure the underlying file handle is released afterwards.
  String contentValue;
  try (Scanner scanner = new Scanner(f)) {
    contentValue = scanner.useDelimiter("\\Z").next();
  }

  ParseResult testParseResult = Utils.createParseResultWithMetadata(new Metadata(), url);
  Content testContent = Utils.createContent(url, contentValue);
  OgcIndexingFilter indexingFilter = new OgcIndexingFilter();
  OgcParseFilter parseFilter = new OgcParseFilter();

  ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);
  parse = res.get(url);
  NutchDocument doc = indexingFilter.filter(nutchDocument, parse, urlText, datum, inlinks);

  assertTrue("Comprobación de que el campo ogc_version esta indexado",
      doc.getFieldNames().contains("ogc_version"));
  assertTrue("Comprobación de que el campo ogc_service esta indexado",
      doc.getFieldNames().contains("ogc_service"));
  assertTrue("Comprobación de que el campo raw_content esta indexado",
      doc.getFieldNames().contains("raw_content"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:OgcIndexingFilterTest.java
注:本文中的org.apache.nutch.indexer.NutchDocument类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论