本文整理汇总了Java中org.archive.io.warc.WARCReaderFactory类的典型用法代码示例。如果您正苦于以下问题:Java WARCReaderFactory类的具体用法?Java WARCReaderFactory怎么用?Java WARCReaderFactory使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
WARCReaderFactory类属于org.archive.io.warc包,在下文中一共展示了WARCReaderFactory类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: processWarc
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Iterates over every record of one WARC file, passing each record to
 * {@code handleRecord} and updating {@code extractorStats}.
 *
 * @param warcFile path to a (possibly gzip-compressed) WARC file
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
private void processWarc(Path warcFile) throws IOException {
    extractorStats.addWarc(warcFile.getFileName().toString());
    // FIX: the original never closed the stream or the reader, leaking a file
    // handle per WARC (and on any exception from handleRecord()).
    InputStream is = Files.newInputStream(warcFile);
    try {
        // The file name lets the factory decide whether to decompress.
        ArchiveReader reader = WARCReaderFactory.get(warcFile.toString(), is, true);
        try {
            reader.setStrict(false);
            int i = 0;
            for (ArchiveRecord record : reader) {
                record.setStrict(false);
                extractorStats.visitedRecord();
                handleRecord(record);
                // Periodic progress report every 1000 records.
                if (i++ % 1000 == 0) {
                    System.err.println(extractorStats);
                }
            }
        } finally {
            reader.close();
        }
    } finally {
        is.close();
    }
}
开发者ID:tballison,项目名称:SimpleCommonCrawlExtractor,代码行数:17,代码来源:AbstractExtractor.java
示例2: generate
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Streams pages out of the remote WARC at {@code sourceURL} and writes up to
 * {@code numPages} of them as one JSON object per line to {@code path}.
 * Pages with no content or no outbound links are skipped.
 *
 * @param path     output file, one gson-serialized Page per line
 * @param numPages maximum number of pages to write
 * @throws Exception on any I/O or parse failure
 */
public static void generate(Path path, int numPages) throws Exception {
    Gson gson = new Gson();
    long count = 0;
    try (BufferedWriter writer = Files.newBufferedWriter(path)) {
        ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0);
        try {
            for (ArchiveRecord r : ar) {
                Page p = ArchiveUtil.buildPage(r);
                if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                    log.debug("Skipping {}", p.getUrl());
                    continue;
                }
                log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
                String json = gson.toJson(p);
                writer.write(json);
                writer.newLine();
                count++;
                if (count == numPages) {
                    break;
                } else if ((count % 1000) == 0) {
                    log.info("Wrote {} of {} pages to {}", count, numPages, path);
                }
            }
        } finally {
            // FIX: the original never closed the reader (network stream leak).
            ar.close();
        }
    }
    // FIX: the original logged numPages here, which over-reports when the
    // archive holds fewer usable pages than requested; log the real count.
    log.info("Wrote {} pages to {}", count, path);
}
开发者ID:astralway,项目名称:webindex,代码行数:27,代码来源:SampleData.java
示例3: readBz2
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Reads a bzip2-compressed WARC file by first decompressing it to a
 * temporary file, then printing each record's header fields and a final
 * record count to stdout.
 *
 * @param file path to the .warc.bz2 file
 * @throws IOException on decompression or read failure
 */
public static void readBz2(String file)
        throws IOException
{
    // Decompress the bz2 payload into a temporary plain .warc file.
    File tmpFile = File.createTempFile("tmp", ".warc");
    try {
        // FIX: the original leaked both the BZip2 input stream and the
        // FileOutputStream; try-with-resources closes them deterministically.
        try (BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(
                new FileInputStream(file));
             FileOutputStream outputStream = new FileOutputStream(tmpFile)) {
            IOUtils.copy(inputStream, outputStream);
        }
        WARCReader reader = WARCReaderFactory.get(tmpFile);
        try {
            int counter = 0;
            for (ArchiveRecord record : reader) {
                System.out.println(record.getHeader().getHeaderFields());
                counter++;
            }
            System.out.println(counter);
        } finally {
            // FIX: the original never closed the reader.
            reader.close();
        }
    } finally {
        // FIX: the original skipped cleanup when an exception was thrown.
        FileUtils.forceDelete(tmpFile);
    }
}
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:30,代码来源:WARCReaderTest.java
示例4: testARCReaderClose
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Verifies that a WARC record can be wrapped in a BitarchiveRecord and that
 * record and reader both close cleanly afterwards.
 */
public void testARCReaderClose() {
    try {
        // Work on a throwaway copy so the source archive stays untouched.
        final File workingCopy = new File(ARCHIVE_DIR + testFileName);
        FileUtils.copyFile(new File(ARCHIVE_DIR + "fyensdk.warc"),
                workingCopy);
        final WARCReader warcReader = WARCReaderFactory.get(workingCopy);
        final WARCRecord firstRecord = (WARCRecord) warcReader.get(0);
        // Construction must succeed while the record is still open.
        BitarchiveRecord bitarchiveRecord =
                new BitarchiveRecord(firstRecord, testFileName);
        firstRecord.close();
        warcReader.close();
        workingCopy.delete();
    } catch (IOException e) {
        fail("Should not throw IOException " + e);
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:19,代码来源:WARCReaderTester.java
示例5: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Reads the first few records of a compressed WARC file and prints each
 * record's header plus the first 500 characters of its body.
 *
 * @param args unused
 * @throws IOException on open/read failure
 */
public static void main(String[] args) throws IOException {
    // Remote compressed WARC file from the Common Crawl public dataset.
    String url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-23/segments/1404776400583.60/warc/CC-MAIN-20140707234000-00000-ip-10-180-212-248.ec2.internal.warc.gz";
    // String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    String fn = url;
    // BUG FIX: the original opened the https:// URL with FileInputStream,
    // which only handles local paths and throws FileNotFoundException at
    // runtime. Open a stream via java.net.URL instead (fully qualified so no
    // new import is needed in this file).
    java.io.InputStream is = new java.net.URL(fn).openStream();
    try {
        // The file name identifies the ArchiveReader and indicates if it should be decompressed
        ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for (ArchiveRecord r : ar) {
            // The header file contains information such as the type of record, size, creation time, and URL
            System.out.println(r.getHeader());
            System.out.println(r.getHeader().getUrl());
            System.out.println();
            // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
            // Create a byte array that is as long as the record's stated length
            byte[] rawData = IOUtils.toByteArray(r, r.available());
            // Why don't we convert it to a string and print the start of it? Let's hope it's text!
            String content = new String(rawData);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Pretty printing to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            if (i++ > 4) break;
        }
    } finally {
        // FIX: the original never closed the stream.
        is.close();
    }
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:36,代码来源:WARCReaderTest.java
示例6: initialize
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Resolve the split back to its backing file and open it on the
    // filesystem that owns it.
    final FileSplit fileSplit = (FileSplit) inputSplit;
    final Path warcPath = fileSplit.getPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fileSystem = warcPath.getFileSystem(conf);
    fsin = fileSystem.open(warcPath);
    arPath = warcPath.getName();
    // The bare file name tells the factory whether to decompress.
    ar = WARCReaderFactory.get(warcPath.getName(), fsin, true);
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:12,代码来源:WARCFileRecordReader.java
示例7: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Spark driver that parses a range of Common Crawl WARC paths from AWS,
 * building pages from every record purely to exercise the parser.
 *
 * @param args args[0] = paths file, args[1] = range spec
 * @throws Exception on setup failure
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.error("Usage: TestParser <pathsFile> <range>");
        System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
        log.error("No files to load given {} {}", args[0], args[1]);
        System.exit(1);
    }
    WebIndexConfig.load();
    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
        log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
            args[0]);
        // One partition per path so each executor task handles one WARC.
        JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());
        final String prefix = WebIndexConfig.CC_URL_PREFIX;
        loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
            String urlToCopy = prefix + path;
            log.info("Parsing {}", urlToCopy);
            try {
                ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0);
                try {
                    for (ArchiveRecord record : reader) {
                        ArchiveUtil.buildPageIgnoreErrors(record);
                    }
                } finally {
                    // FIX: the original never closed the reader, leaking the
                    // underlying HTTP stream for every WARC in the partition.
                    reader.close();
                }
            } catch (Exception e) {
                log.error("Exception while processing {}", path, e);
            }
        }));
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:39,代码来源:TestParser.java
示例8: initialize
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
        InterruptedException {
    // Open the WARC file backing this input split.
    Path p = ((FileSplit) inputSplit).getPath();
    FileSystem fileSystem = p.getFileSystem(context.getConfiguration());
    fsin = fileSystem.open(p);
    arPath = p.getName();
    // File-name suffix drives the factory's compression detection.
    ar = WARCReaderFactory.get(p.getName(), fsin, true);
}
开发者ID:astralway,项目名称:webindex,代码行数:12,代码来源:WARCFileRecordReader.java
示例9: readPages
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Builds a URL-to-Page map from every non-empty, link-bearing record in the
 * given WARC file.
 *
 * @param input a WARC file on the local filesystem
 * @return map keyed by the page's normalized URI
 * @throws Exception if the archive cannot be read or a page fails to build
 */
public static Map<URL, Page> readPages(File input) throws Exception {
    Map<URL, Page> pageMap = new HashMap<>();
    ArchiveReader ar = WARCReaderFactory.get(input);
    try {
        for (ArchiveRecord r : ar) {
            Page p = ArchiveUtil.buildPage(r);
            // Skip records that produced no page content or have no links.
            if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                continue;
            }
            pageMap.put(URL.fromUri(p.getUri()), p);
        }
    } finally {
        // FIX: the original only closed the reader on the success path,
        // leaking the file handle when buildPage() threw.
        ar.close();
    }
    return pageMap;
}
开发者ID:astralway,项目名称:webindex,代码行数:14,代码来源:IndexIT.java
示例10: read
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Reads default (gzipped) warc file, printing each record's header fields
 * and a final record count to stdout.
 *
 * @param file gz file
 * @throws IOException on open/read failure
 */
public static void read(String file)
        throws IOException
{
    WARCReader reader = WARCReaderFactory.get(new File(file));
    try {
        int counter = 0;
        for (ArchiveRecord record : reader) {
            System.out.println(record.getHeader().getHeaderFields());
            counter++;
        }
        System.out.println(counter);
    } finally {
        // FIX: the original never closed the reader, leaking the file handle.
        reader.close();
    }
}
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:21,代码来源:WARCReaderTest.java
示例11: openFile
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/** Opens the given path as a WARCReader; the caller owns and must close it. */
private WARCReader openFile(Path filePath) throws IOException {
    File warcFile = filePath.toFile();
    return WARCReaderFactory.get(warcFile);
}
开发者ID:ViDA-NYU,项目名称:ache,代码行数:4,代码来源:WarcTargetRepository.java
示例12: open
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Opens an archive file, routing still-being-written WARCs (".warc.gz.open")
 * straight to the WARC factory because the generic factory does not
 * recognize that suffix.
 */
public static ArchiveReader open(Path path) throws IOException {
    final File archiveFile = path.toFile();
    final boolean inProgressWarc = path.toString().endsWith(".warc.gz.open");
    return inProgressWarc
            ? WARCReaderFactory.get(archiveFile)
            : ArchiveReaderFactory.get(archiveFile);
}
开发者ID:nla,项目名称:bamboo,代码行数:11,代码来源:WarcUtils.java
示例13: testWarcCopy
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Round-trip test: writes a single hand-crafted WARC "metadata" record to a
 * file, copies that file into a fresh WARC via WARCUtils, then reads the
 * copy back and asserts each header field survived the copy intact.
 */
public void testWarcCopy() {
    try {
        // A minimal but complete WARC/1.0 metadata record built by hand.
        // NOTE(review): two WARC-Concurrent-To headers are written on purpose;
        // the assertion further down checks the value that read-back reports
        // (the ...b31 one), i.e. the later header appears to win — confirm
        // this is the intended webarchive-commons behavior.
        byte[] warcBytes = (
            "WARC/1.0\r\n"
            + "WARC-Type: metadata\r\n"
            + "WARC-Target-URI: metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86\r\n"
            + "WARC-Date: 2012-08-24T11:42:55Z\r\n"
            + "WARC-Record-ID: <urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>\r\n"
            + "WARC-Payload-Digest: sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U\r\n"
            + "WARC-IP-Address: 207.241.229.39\r\n"
            + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb30>\r\n"
            + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>\r\n"
            + "Content-Type: text/plain\r\n"
            + "Content-Length: 2\r\n"
            + "\r\n"
            + "85"
            + "\r\n"
            + "\r\n").getBytes();
        // Write the raw record bytes to disk as the "original" WARC file.
        File orgFile = new File(TestInfo.WORKING_DIR, "original4copy.warc");
        FileUtils.writeBinaryFile(orgFile, warcBytes);
        // Copy the original into a brand-new WARC via the code under test.
        File copiedFile = new File(TestInfo.WORKING_DIR, "copied.warc");
        WARCWriter writer = WARCUtils.createWARCWriter(copiedFile);
        WARCUtils.insertWARCFile(orgFile, writer);
        writer.close();
        byte[] bytes = FileUtils.readBinaryFile(copiedFile);
        //System.out.println( new String(bytes));
        // Read the copy back and verify every header field round-tripped.
        WARCReader reader = WARCReaderFactory.get(copiedFile);
        Assert.assertNotNull(reader);
        ArchiveRecord record = reader.get();
        Assert.assertNotNull(record);
        ArchiveRecordHeader header = record.getHeader();
        Assert.assertNotNull(header);
        Assert.assertEquals("metadata", header.getHeaderValue("WARC-Type"));
        Assert.assertEquals("metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86", header.getHeaderValue("WARC-Target-URI"));
        Assert.assertEquals("2012-08-24T11:42:55Z", header.getHeaderValue("WARC-Date"));
        Assert.assertEquals("<urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>", header.getHeaderValue("WARC-Record-ID"));
        Assert.assertEquals("sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", header.getHeaderValue("WARC-Payload-Digest"));
        Assert.assertEquals("207.241.229.39", header.getHeaderValue("WARC-IP-Address"));
        Assert.assertEquals("<urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>", header.getHeaderValue("WARC-Concurrent-To"));
        Assert.assertEquals("text/plain", header.getHeaderValue("Content-Type"));
        Assert.assertEquals("2", header.getHeaderValue("Content-Length"));
    }
    catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Unexpected exception!");
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:53,代码来源:WARCUtilsTester.java
示例14: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Streams the first few records of a Common Crawl WARC directly from the
 * public S3 bucket and prints each header plus up to 500 characters of body.
 *
 * @param args unused
 * @throws IOException on read failure
 * @throws S3ServiceException on S3 access failure
 */
public static void main(String[] args) throws IOException, S3ServiceException {
    // We're accessing a publicly available bucket so don't need to fill in our credentials
    S3Service s3s = new RestS3Service(null);
    // Let's grab a file out of the CommonCrawl S3 bucket
    String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
    // The file name identifies the ArchiveReader and indicates if it should be decompressed
    ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
    try {
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for (ArchiveRecord r : ar) {
            // The header file contains information such as the type of record, size, creation time, and URL
            System.out.println("Header: " + r.getHeader());
            System.out.println("URL: " + r.getHeader().getUrl());
            System.out.println();
            // Create a byte array that is as long as all the record's stated length
            byte[] rawData = new byte[r.available()];
            // FIX: a single read() is not guaranteed to fill the buffer
            // (InputStream contract); loop until the buffer is full or EOF.
            int off = 0;
            while (off < rawData.length) {
                int n = r.read(rawData, off, rawData.length - off);
                if (n < 0) {
                    break;
                }
                off += n;
            }
            // Why don't we convert it to a string and print the start of it? Let's hope it's text!
            String content = new String(rawData);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Pretty printing to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            if (i++ > 4) break;
        }
    } finally {
        // FIX: the original never closed the reader / S3 data stream.
        ar.close();
    }
}
开发者ID:Smerity,项目名称:cc-warc-examples,代码行数:36,代码来源:S3ReaderTest.java
示例15: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Reads the first few records of a local compressed WARC file and prints
 * each header plus up to 500 characters of body text.
 *
 * @param args unused
 * @throws IOException on open/read failure
 */
public static void main(String[] args) throws IOException {
    // Set up a local compressed WARC file for reading
    String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    FileInputStream is = new FileInputStream(fn);
    try {
        // The file name identifies the ArchiveReader and indicates if it should be decompressed
        ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for (ArchiveRecord r : ar) {
            // The header file contains information such as the type of record, size, creation time, and URL
            System.out.println(r.getHeader());
            System.out.println(r.getHeader().getUrl());
            System.out.println();
            // Create a byte array that is as long as the record's stated length
            byte[] rawData = IOUtils.toByteArray(r, r.available());
            // Why don't we convert it to a string and print the start of it? Let's hope it's text!
            String content = new String(rawData);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Pretty printing to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            if (i++ > 4) break;
        }
    } finally {
        // FIX: the original never closed the input stream (file-handle leak).
        is.close();
    }
}
开发者ID:Smerity,项目名称:cc-warc-examples,代码行数:34,代码来源:WARCReaderTest.java
示例16: getArchiveReader
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Dispatches to the ARC or WARC reader factory based on the file's suffix.
 *
 * @param f      archive file to open
 * @param offset byte offset at which the reader should start
 * @return an ArchiveReader positioned at {@code offset}
 * @throws IOException when the suffix is neither ARC nor WARC
 */
protected ArchiveReader getArchiveReader(final File f,
        final long offset)
        throws IOException {
    final String name = f.getName();
    if (ARCReaderFactory.isARCSuffix(name)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(name)) {
        return WARCReaderFactory.get(f, offset);
    }
    throw new IOException("Unknown file extension (Not ARC nor WARC): "
        + f.getName());
}
开发者ID:iipc,项目名称:webarchive-commons,代码行数:12,代码来源:ArchiveReaderFactory.java
示例17: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Streams the first few records of a Common Crawl WARC from the public S3
 * bucket, printing each header and up to 500 characters of body text.
 *
 * @param args unused
 * @throws IOException on read failure
 * @throws S3ServiceException on S3 access failure
 */
public static void main(String[] args) throws IOException, S3ServiceException {
    // We're accessing a publicly available bucket so don't need to fill in our credentials
    S3Service s3s = new RestS3Service(null);
    // Let's grab a file out of the CommonCrawl S3 bucket
    String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
    // The file name identifies the ArchiveReader and indicates if it should be decompressed
    ArchiveReader ar;
    try {
        ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
        try {
            // Once we have an ArchiveReader, we can work through each of the records it contains
            int i = 0;
            for (ArchiveRecord r : ar) {
                // The header file contains information such as the type of record, size, creation time, and URL
                System.out.println("Header: " + r.getHeader());
                System.out.println("URL: " + r.getHeader().getUrl());
                // Create a byte array that is as long as all the record's stated length
                byte[] rawData = new byte[r.available()];
                // FIX: a single read() may return fewer bytes than requested
                // (InputStream contract); loop until full or EOF.
                int off = 0;
                while (off < rawData.length) {
                    int n = r.read(rawData, off, rawData.length - off);
                    if (n < 0) {
                        break;
                    }
                    off += n;
                }
                // Why don't we convert it to a string and print the start of it? Let's hope it's text!
                String content = new String(rawData);
                System.out.println(content.substring(0, Math.min(500, content.length())));
                System.out.println((content.length() > 500 ? "..." : ""));
                // Pretty printing to make the output more readable
                System.out.println("=-=-=-=-=-=-=-=-=");
                if (i++ > 4) break;
            }
        } finally {
            // FIX: the original never closed the reader / S3 data stream.
            ar.close();
        }
    } catch (ServiceException e) {
        // Boundary catch for example code; a real application should handle
        // or propagate this instead of just printing it.
        e.printStackTrace();
    }
}
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:44,代码来源:S3ReaderTest.java
示例18: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Spark driver that loads a range of Common Crawl WARC paths from AWS into
 * Fluo, rate-limited, one path per partition.
 *
 * @param args args[0] = paths file, args[1] = range spec
 * @throws Exception on setup failure
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        log.error("Usage: LoadS3 <pathsFile> <range>");
        System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
        log.error("No files to load given {} {}", args[0], args[1]);
        System.exit(1);
    }
    // A rateLimit <= 0 disables rate limiting (rateLimiter stays null below).
    final int rateLimit = WebIndexConfig.load().getLoadRateLimit();
    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-s3");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
        log.info("Loading {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
            args[0]);
        // One partition per path so each executor task handles one WARC.
        JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());
        final String prefix = WebIndexConfig.CC_URL_PREFIX;
        loadRDD.foreachPartition(iter -> {
            final FluoConfiguration fluoConfig = new FluoConfiguration(new File("fluo.properties"));
            final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
            try (FluoClient client = FluoFactory.newClient(fluoConfig);
                 LoaderExecutor le = client.newLoaderExecutor()) {
                iter.forEachRemaining(path -> {
                    String urlToCopy = prefix + path;
                    log.info("Loading {} to Fluo", urlToCopy);
                    try {
                        ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0);
                        try {
                            for (ArchiveRecord record : reader) {
                                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                                if (page.getOutboundLinks().size() > 0) {
                                    log.info("Loading page {} with {} links", page.getUrl(),
                                        page.getOutboundLinks().size());
                                    if (rateLimiter != null) {
                                        rateLimiter.acquire();
                                    }
                                    le.execute(PageLoader.updatePage(page));
                                }
                            }
                        } finally {
                            // FIX: the original never closed the reader,
                            // leaking an HTTP stream per WARC processed.
                            reader.close();
                        }
                    } catch (Exception e) {
                        log.error("Exception while processing {}", path, e);
                    }
                });
            }
        });
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:54,代码来源:LoadS3.java
示例19: main
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Spark driver that loads all WARC files found under an HDFS directory into
 * Fluo, rate-limited, one loader executor per partition.
 *
 * args[0] = HDFS data directory to scan recursively for WARC files.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        log.error("Usage: LoadHdfs <dataDir>");
        System.exit(1);
    }
    final String dataDir = args[0];
    IndexEnv.validateDataDir(dataDir);
    final String hadoopConfDir = IndexEnv.getHadoopConfDir();
    // A rateLimit <= 0 disables rate limiting (rateLimiter stays null below).
    final int rateLimit = WebIndexConfig.load().getLoadRateLimit();
    // Collect every regular file under dataDir (recursive listing).
    List<String> loadPaths = new ArrayList<>();
    FileSystem hdfs = IndexEnv.getHDFS();
    RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(new Path(dataDir), true);
    while (listIter.hasNext()) {
        LocatedFileStatus status = listIter.next();
        if (status.isFile()) {
            loadPaths.add(status.getPath().toString());
        }
    }
    log.info("Loading {} files into Fluo from {}", loadPaths.size(), dataDir);
    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-hdfs");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
        // One partition per file so each executor task handles one WARC.
        JavaRDD<String> paths = ctx.parallelize(loadPaths, loadPaths.size());
        paths.foreachPartition(iter -> {
            // Everything below runs on the executor, so resources are
            // re-created per partition (FluoClient, rate limiter, HDFS handle).
            final FluoConfiguration fluoConfig = new FluoConfiguration(new File("fluo.properties"));
            final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
            FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
            try (FluoClient client = FluoFactory.newClient(fluoConfig);
                LoaderExecutor le = client.newLoaderExecutor()) {
                iter.forEachRemaining(path -> {
                    Path filePath = new Path(path);
                    try {
                        if (fs.exists(filePath)) {
                            // NOTE(review): neither fsin nor reader is ever
                            // closed here — looks like a handle leak per file;
                            // confirm and add a finally/close.
                            FSDataInputStream fsin = fs.open(filePath);
                            ArchiveReader reader = WARCReaderFactory.get(filePath.getName(), fsin, true);
                            for (ArchiveRecord record : reader) {
                                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                                // Only pages with outbound links are loaded.
                                if (page.getOutboundLinks().size() > 0) {
                                    log.info("Loading page {} with {} links", page.getUrl(), page
                                        .getOutboundLinks().size());
                                    if (rateLimiter != null) {
                                        rateLimiter.acquire();
                                    }
                                    le.execute(PageLoader.updatePage(page));
                                }
                            }
                        }
                    } catch (IOException e) {
                        log.error("Exception while processing {}", path, e);
                    }
                });
            }
        });
    }
}
开发者ID:astralway,项目名称:webindex,代码行数:62,代码来源:LoadHdfs.java
示例20: testBasic
import org.archive.io.warc.WARCReaderFactory; //导入依赖的package包/类
/**
 * Exercises ArchiveUtil.buildPage against two fixture WAT/WARC files:
 * first asserting every extracted field of a known single-record file,
 * then counting parse successes/failures over an 18-record file.
 */
@Test
public void testBasic() throws IOException, ParseException {
    // Single-record fixture: every extracted field is pinned exactly.
    // NOTE(review): neither reader opened in this test is closed — consider
    // try-with-resources if ArchiveReader implements Closeable.
    ArchiveReader archiveReader = WARCReaderFactory.get(new File("src/test/resources/wat.warc"));
    Page page = ArchiveUtil.buildPage(archiveReader.get());
    Assert.assertNotNull(page);
    Assert.assertFalse(page.isEmpty());
    Assert
        .assertEquals(
            "http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
            page.getUrl());
    // getUri() is the reversed-domain ("surt"-style) form of the URL above.
    Assert
        .assertEquals(
            "com.1079ishot>>o>/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
            page.getUri());
    Assert.assertEquals("2015-04-18T03:35:13Z", page.getCrawlDate());
    Assert.assertEquals("nginx/1.6.2", page.getServer());
    Assert
        .assertEquals(
            "Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at ",
            page.getTitle());
    Assert.assertEquals(0, page.getOutboundLinks().size());
    // Multi-record fixture: all 18 records must parse without ParseException.
    ArchiveReader ar2 = WARCReaderFactory.get(new File("src/test/resources/wat-18.warc"));
    int valid = 0;
    int invalid = 0;
    Iterator<ArchiveRecord> records = ar2.iterator();
    while (records.hasNext()) {
        try {
            ArchiveRecord r = records.next();
            ArchiveUtil.buildPage(r);
            valid++;
        } catch (ParseException e) {
            invalid++;
        }
    }
    Assert.assertEquals(18, valid);
    Assert.assertEquals(0, invalid);
}
开发者ID:astralway,项目名称:webindex,代码行数:43,代码来源:ArchiveUtilTest.java
注:本文中的org.archive.io.warc.WARCReaderFactory类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论