本文整理汇总了Java中org.apache.nutch.indexer.IndexingException类的典型用法代码示例。如果您正苦于以下问题：Java IndexingException类的具体用法？Java IndexingException怎么用？Java IndexingException使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
IndexingException类属于org.apache.nutch.indexer包，在下文中一共展示了IndexingException类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. A tag is only added when the
 * datum actually carries a value for it.
 *
 * @return the same document instance that was passed in (possibly
 *         {@code null})
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  // Without a document or a tag list there is nothing to copy.
  boolean nothingToCopy = (doc == null) || (urlMetaTags == null);
  if (nothingToCopy) {
    return doc;
  }

  for (String tagName : urlMetaTags) {
    Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
    if (tagValue == null) {
      continue;
    }
    doc.add(tagName, tagValue.toString());
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:URLMetaIndexingFilter.java
示例2: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds a "lang" field to the document. The value is taken from the parse
 * metadata, then from the HTTP Content-Language header, and defaults to
 * "unknown" when neither yields a non-empty value.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // First choice: LANGUAGE entry, possibly put there by HTMLLanguageParser.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
  if (language == null) {
    // Second choice: whatever the HTTP header tells us about the language.
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = (language == null) || (language.length() == 0);
  doc.add("lang", undetermined ? "unknown" : language);
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:LanguageIndexingFilter.java
示例3: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * {@inheritDoc}
 *
 * Applies the host-keyed replacers and then the URL-keyed replacers to the
 * document; each group is skipped when it is empty.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // A missing document is passed through untouched.
  if (doc == null) {
    return doc;
  }

  boolean hasHostReplacers = FIELDREPLACERS_BY_HOST.size() > 0;
  if (hasHostReplacers) {
    this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
  }

  boolean hasUrlReplacers = FIELDREPLACERS_BY_URL.size() > 0;
  if (hasUrlReplacers) {
    this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:ReplaceIndexer.java
示例4: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Copies every parse-metadata entry into the document and keeps the document
 * only when one of those entries is named "ogc_service".
 *
 * @return the document if it is an ogc service, otherwise {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  Metadata parseMeta = parse.getData().getParseMeta();

  boolean isOgcService = false;
  for (String name : parseMeta.names()) {
    // Non-short-circuit OR: the comparison runs for every key, as before.
    isOgcService |= name.equals("ogc_service");
    String fieldValue = parseMeta.get(name);
    LOG.info("Adding " + url + " to NutchDocument");
    doc.add(name, fieldValue);
  }

  if (!isOgcService) {
    return null;
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:OgcIndexingFilter.java
示例5: testOgcIndexingFilter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Loads the "testWMS.xml" resource, runs it through {@code OgcParseFilter}
 * and then {@code OgcIndexingFilter}, and checks that the resulting document
 * contains the "ogc_version", "ogc_service" and "raw_content" fields.
 */
@Test
public void testOgcIndexingFilter() throws FileNotFoundException, URISyntaxException, IndexingException {
  File f = new File(getClass().getResource("testWMS.xml").toURI());
  String contentValue;
  // try-with-resources closes the Scanner (and the file it opened) even when
  // reading fails, instead of suppressing the resource-leak warning.
  try (Scanner scanner = new Scanner(f)) {
    // "\\Z" matches end of input, so next() yields the whole file as one token.
    contentValue = scanner.useDelimiter("\\Z").next();
  }
  ParseResult testParseResult = Utils.createParseResultWithMetadata(new Metadata(), url);
  Content testContent = Utils.createContent(url, contentValue);
  OgcIndexingFilter indexingFilter = new OgcIndexingFilter();
  OgcParseFilter parseFilter = new OgcParseFilter();
  ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);
  parse = res.get(url);
  NutchDocument doc = indexingFilter.filter(nutchDocument, parse, urlText, datum, inlinks);
  assertTrue("Comprobación de que el campo ogc_version esta indexado",
      doc.getFieldNames().contains("ogc_version"));
  assertTrue("Comprobación de que el campo ogc_service esta indexado",
      doc.getFieldNames().contains("ogc_service"));
  assertTrue("Comprobación de que el campo raw_content esta indexado",
      doc.getFieldNames().contains("raw_content"));
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:OgcIndexingFilterTest.java
示例6: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * For each metatag named in the "urlmeta.tags" property, looks the tag up in
 * the CrawlDatum metadata and, when a value is present, adds it to the
 * NutchDocument under the tag's name.
 *
 * @return the document that was passed in, which may be {@code null}
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tag : urlMetaTags) {
      Text value = (Text) datum.getMetaData().get(new Text(tag));
      // Tags absent from the datum are simply skipped.
      if (value != null) {
        doc.add(tag, value.toString());
      }
    }
  }
  return doc;
}
开发者ID:yahoo,项目名称:anthelion,代码行数:25,代码来源:URLMetaIndexingFilter.java
示例7: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Indexes the page language as the "lang" field, using "unknown" when no
 * non-empty language can be found in the parse metadata or, failing that, in
 * the HTTP Content-Language header.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  // LANGUAGE may have been put into the parse metadata by HTMLLanguageParser.
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Only consult the HTTP header when the parse metadata has no entry at all.
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  if (detected != null && detected.length() != 0) {
    doc.add("lang", detected);
  } else {
    doc.add("lang", "unknown");
  }
  return doc;
}
开发者ID:yahoo,项目名称:anthelion,代码行数:20,代码来源:LanguageIndexingFilter.java
示例8: getSiteHashFromJsonStream
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Parses the JSON document read from the given stream and returns the text
 * value of its "sitehash" field. The stream is closed before this method
 * returns, whether or not parsing succeeded.
 *
 * @param stream
 *          with Json data
 * @return siteHash
 * @throws IndexingException
 *           if the stream cannot be read or parsed, or if the parsed document
 *           has no "sitehash" field; the underlying exception is the cause
 */
protected String getSiteHashFromJsonStream(InputStream stream)
    throws IndexingException {
  try {
    JsonNode rootNode = jsonMapper.readValue(stream, JsonNode.class);
    JsonNode siteHashNode = (rootNode == null) ? null : rootNode.get("sitehash");
    if (siteHashNode == null) {
      // Fail with a descriptive cause instead of an anonymous NullPointerException.
      throw new IllegalStateException("JSON response contains no \"sitehash\" field");
    }
    String siteHash = siteHashNode.getTextValue();
    LOG.info("TYPO3 Solr siteHash retrieved: " + siteHash);
    return siteHash;
  } catch (Exception e) {
    // Pass the exception to the logger so the root cause shows up in the log.
    LOG.error("ERROR! could not receive correct siteHash data from the Solr TYPO3 Api", e);
    throw (new IndexingException(e));
  } finally {
    if (stream != null) {
      try {
        stream.close();
      } catch (IOException streamException) {
        // A failure to close must not mask the result or the original error.
        LOG.error(streamException.getMessage(), streamException);
      }
    }
  }
}
开发者ID:dkd,项目名称:nutch-typo3-cms,代码行数:32,代码来源:SiteHashIndexingFilter.java
示例9: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds the configured end time to the document as a {@link Date} under
 * {@code INDEXING_FIELD}. The end time is read from the configuration
 * property {@code CONF_ENDTIME_PROPERTY} (default "1970-01-01T00:00:00Z").
 * If the value cannot be parsed, an error is logged and the epoch
 * (time stamp 0) is indexed instead.
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // convert ISO date to time stamp
  String isoDate = conf.get(CONF_ENDTIME_PROPERTY, "1970-01-01T00:00:00Z");
  long epoch = 0;
  try {
    epoch = parseIsoDate(isoDate).getTime();
  } catch (ParseException e) {
    LOG.error("ERROR! Cannot parse date, must fit pattern yyyy-MM-dd'T'HH:mm:ssZ : " + isoDate);
  }
  // Index the endtime
  doc.add(INDEXING_FIELD, new Date(epoch));
  return doc;
}

/**
 * Parses an ISO-8601 style date-time with a zone designator.
 *
 * The SimpleDateFormat pattern letter 'Z' only accepts RFC 822 offsets such
 * as "+0000"; it rejects the literal "Z" designator used by the default
 * value. The ISO 8601 pattern "XXX" (accepts "Z" and "+00:00") is therefore
 * tried first, and the original RFC 822 pattern is kept as a fallback so
 * previously valid values still parse.
 *
 * @throws ParseException if the value matches neither pattern
 */
private static Date parseIsoDate(String isoDate) throws ParseException {
  try {
    return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX").parse(isoDate);
  } catch (ParseException ignored) {
    // Not an ISO 8601 zone designator; fall back to the RFC 822 form.
    return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(isoDate);
  }
}
开发者ID:dkd,项目名称:nutch-typo3-cms,代码行数:19,代码来源:EndtimeIndexingFilter.java
示例10: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Copies the page metadata entry stored under {@code storageField} into the
 * document, decoded as UTF-8, using {@code storageField} as the field name.
 * The document is returned unchanged when no storage field is configured,
 * when the page has no such metadata entry, or when the bytes cannot be
 * decoded.
 */
@Override
public NutchDocument filter(NutchDocument document, String s, WebPage webPage) throws IndexingException {
  if (storageField == null) {
    return document;
  }

  // The metadata entry may be absent; decoding null would throw a
  // NullPointerException, so skip the field instead.
  java.nio.ByteBuffer storedBytes = webPage.getMetadata().get(new Utf8(storageField));
  if (storedBytes == null) {
    return document;
  }

  CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
  try {
    // decode() advances the buffer's position; work on a duplicate so the
    // buffer held by the WebPage stays readable for other consumers.
    String strippedContent = decoder.decode(storedBytes.duplicate()).toString();
    document.add(storageField, strippedContent);
  } catch (CharacterCodingException e) {
    // NOTE(review): no logger is visible in this snippet; switch to the
    // class logger if one exists.
    e.printStackTrace();
  }
  return document;
}
开发者ID:kaqqao,项目名称:nutch-element-selector,代码行数:17,代码来源:HtmlElementSelectorIndexer.java
示例11: testOgcIndexingFilter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Checks the number of results {@code th.execQuery} returns for three query
 * terms.
 */
@Test
public void testOgcIndexingFilter() throws FileNotFoundException, URISyntaxException, IndexingException {
  // assertEquals takes (expected, actual); with the arguments swapped a
  // failure message would report the two values the wrong way round.
  int results = th.execQuery("agua");
  assertEquals(1, results);
  results = th.execQuery("вода");
  assertEquals(1, results);
  results = th.execQuery("Mar");
  assertEquals(30, results);
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:11,代码来源:ThesaurusTest.java
示例12: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds one "tag" field to the document for every Rel-Tag value found in the
 * parse metadata under {@code RelTagParser.REL_TAG}.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Rel-Tags, if any, were possibly put there by RelTagParser.
  String[] relTags = parse.getData().getParseMeta()
      .getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:15,代码来源:RelTagIndexingFilter.java
示例13: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds a "tld" field holding {@code getDomain()} of the URL's
 * {@code DomainSuffix}. This is best effort: any failure is logged as a
 * warning and the document is returned without the field.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  try {
    URL url = new URL(urlText.toString());
    DomainSuffix d = URLUtil.getDomainSuffix(url);
    // NOTE(review): getDomainSuffix presumably returns null when the host has
    // no known suffix - confirm. Guarding here avoids relying on a caught
    // NullPointerException to skip the field.
    if (d != null) {
      doc.add("tld", d.getDomain());
    }
  } catch (Exception ex) {
    LOG.warn(ex.toString());
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:16,代码来源:TLDIndexingFilter.java
示例14: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Delegates to the addTime, addLength, addType and resetTitle helpers, in
 * that order, passing each the document, the parse data and the URL as a
 * string.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  final String urlString = url.toString();

  // Time and type also need the crawl datum; length and title do not.
  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:13,代码来源:MoreIndexingFilter.java
示例15: assertContentType
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Runs a MoreIndexingFilter configured with {@code conf} over an empty
 * document whose parse metadata carries {@code source} as Content-Type, and
 * asserts that the document's "type" field equals {@code expected}.
 */
private void assertContentType(Configuration conf, String source,
    String expected) throws IndexingException {
  Metadata parseMetadata = new Metadata();
  parseMetadata.add(Response.CONTENT_TYPE, source);

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], parseMetadata);
  ParseImpl parsed = new ParseImpl("text", parseData);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  NutchDocument indexed = moreFilter.filter(new NutchDocument(), parsed,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Object actualType = indexed.getFieldValue("type");
  Assert.assertEquals("mime type not detected", expected, actualType);
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:14,代码来源:TestMoreIndexingFilter.java
示例16: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds the anchor texts of the inlinks to the document as "anchor" fields.
 * When the {@code deduplicate} flag is set (see
 * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml), an anchor
 * is added only the first time its lower-cased form is seen.
 *
 * @param doc
 *          The {@link NutchDocument} object
 * @param parse
 *          The relevant {@link Parse} object passing through the filter
 * @param url
 *          URL to be filtered for anchor text
 * @param datum
 *          The {@link CrawlDatum} entry
 * @param inlinks
 *          The {@link Inlinks} containing anchor text; may be {@code null}
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  String[] anchors;
  if (inlinks == null) {
    anchors = new String[0];
  } else {
    anchors = inlinks.getAnchors();
  }

  // Lower-cased anchors already indexed; created lazily on first use.
  HashSet<String> seen = null;
  for (String anchor : anchors) {
    if (!deduplicate) {
      doc.add("anchor", anchor);
      continue;
    }
    if (seen == null) {
      seen = new HashSet<String>();
    }
    String normalized = anchor.toLowerCase();
    boolean alreadyIndexed = seen.contains(normalized);
    if (alreadyIndexed) {
      continue;
    }
    doc.add("anchor", anchor);
    seen.add(normalized);
  }
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:45,代码来源:AnchorIndexingFilter.java
示例17: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Indexes Creative Commons information found in the parse metadata: the
 * license URL (plus the features derived from it), the license location and
 * the work type. Each item is skipped when its metadata entry is absent.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  final Metadata parseMeta = parse.getData().getParseMeta();

  // License: the whole URL as license=xxx, then the attributes extracted
  // from the license url.
  final String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // License location as meta=xxx.
  final String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // Work type, added as-is.
  final String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:33,代码来源:CCIndexingFilter.java
示例18: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds a "length" field holding the number of characters in the parsed text.
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  final int textLength = parse.getText().length();
  doc.add("length", textLength);
  return doc;
}
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:8,代码来源:LengthIndexingFilter.java
示例19: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Indexes each Rel-Tag value from the parse metadata
 * ({@code RelTagParser.REL_TAG}) as a separate "tag" field.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  // Rel-Tags, when present, were possibly put there by RelTagParser.
  final String[] found = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);

  if (found != null) {
    for (String tagValue : found) {
      doc.add("tag", tagValue);
    }
  }
  return doc;
}
开发者ID:yahoo,项目名称:anthelion,代码行数:14,代码来源:RelTagIndexingFilter.java
示例20: filter
import org.apache.nutch.indexer.IndexingException; //导入依赖的package包/类
/**
 * Adds a "tld" field holding {@code getDomain()} of the URL's
 * {@code DomainSuffix}. This is best effort: any failure is logged as a
 * warning and the document is returned without the field.
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  try {
    URL url = new URL(urlText.toString());
    DomainSuffix d = URLUtil.getDomainSuffix(url);
    // NOTE(review): getDomainSuffix presumably returns null when the host has
    // no known suffix - confirm. Guarding here avoids relying on a caught
    // NullPointerException to skip the field.
    if (d != null) {
      doc.add("tld", d.getDomain());
    }
  } catch (Exception ex) {
    LOG.warn(ex.toString());
  }
  return doc;
}
开发者ID:yahoo,项目名称:anthelion,代码行数:16,代码来源:TLDIndexingFilter.java
注：本文中的org.apache.nutch.indexer.IndexingException类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论