This article collects typical usage examples of the Java class parquet.hadoop.metadata.BlockMetaData. If you are wondering what BlockMetaData is for and how to use it, the curated examples below should help.
The BlockMetaData class belongs to the parquet.hadoop.metadata package. Twenty code examples are shown below, sorted by popularity.
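Before the examples, here is a minimal sketch (not taken from the examples below) of where a List<BlockMetaData> usually comes from: read a file footer and iterate its row groups. The file path and Configuration are placeholders, and it assumes this parquet-mr generation still exposes the static ParquetFileReader.readFooter(Configuration, Path) helper.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

Configuration conf = new Configuration();
Path file = new Path("/tmp/example.parquet"); // placeholder path
ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
long totalRows = 0;
for (BlockMetaData block : footer.getBlocks()) {
  totalRows += block.getRowCount(); // rows in this row group
  System.out.println("row group at offset " + block.getStartingPos()
      + ", compressed bytes " + block.getCompressedSize());
}
System.out.println("total rows: " + totalRows);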
Example 1: getRowGroupNumbersFromFileSplit
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
Author: skhalifa, Project: QDrill, Lines: 25, Source: HiveDrillNativeScanBatchCreator.java
Example 2: initialize
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public void initialize(MessageType fileSchema,
    Map<String, String> fileMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Author: grokcoder, Project: pbase, Lines: 22, Source: InternalParquetRecordReader.java
Example 3: initReader
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
    MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
        filter, blocks, fileSchema);

    fileInfo.setBlockMetaDatas(blocks);
    fileInfo.setFileSchema(fileSchema);
    fileInfo.setFilteredBlocks(filteredBlocks);

    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(fileSchema,
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        footer.getFile(), filteredBlocks, conf);
  }
}
Author: grokcoder, Project: pbase, Lines: 26, Source: ParquetReader.java
Example 4: add
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Author: grokcoder, Project: pbase, Lines: 19, Source: PrintFooter.java
Example 5: mergeFooters
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
Author: grokcoder, Project: pbase, Lines: 22, Source: ParquetFileWriter.java
Example 6: checkBelongingToANewHDFSBlock
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * @param rowGroupMetadata
 * @return true if the mid point of the row group is in a new HDFS block, and also move the currentHDFSBlock pointer
 *         to the correct index that contains the row group;
 *         return false if the mid point of the row group is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
  boolean isNewHdfsBlock = false;
  long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

  // if the mid point is not in the current HDFS block any more, return true
  while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
    isNewHdfsBlock = true;
    currentMidPointHDFSBlockIndex++;
    if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
          + rowGroupMidPoint
          + ", the end of the hdfs block is "
          + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
  }

  while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
    currentStartHdfsBlockIndex++;
    if (currentStartHdfsBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
          + rowGroupMetadata.getStartingPos()
          + " but the end of hdfs blocks of file is "
          + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
  }
  return isNewHdfsBlock;
}
Author: grokcoder, Project: pbase, Lines: 31, Source: ParquetInputFormat.java
Example 7: generateSplits
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Groups together all the data blocks for the same HDFS block.
 *
 * @param rowGroupBlocks      data blocks (row groups)
 * @param hdfsBlocksArray     hdfs blocks
 * @param fileStatus          the containing file
 * @param requestedSchema     the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize        the mapred.min.split.size
 * @param maxSplitSize        the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException if hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
    List<BlockMetaData> rowGroupBlocks,
    BlockLocation[] hdfsBlocksArray,
    FileStatus fileStatus,
    String requestedSchema,
    Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {
  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  // generate splits from the row groups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
Author: grokcoder, Project: pbase, Lines: 32, Source: ParquetInputFormat.java
Example 8: visit
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
  FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();

  // check that the schema of the filter matches the schema of the file
  SchemaCompatibilityValidator.validate(filterPredicate, schema);

  List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();
  for (BlockMetaData block : blocks) {
    if (!StatisticsFilter.canDrop(filterPredicate, block.getColumns())) {
      filteredBlocks.add(block);
    }
  }
  return filteredBlocks;
}
Author: grokcoder, Project: pbase, Lines: 18, Source: RowGroupFilter.java
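As a rough usage sketch (the column name "x" and the comparison value are placeholders, assuming the parquet.filter2 API shipped with the same release): build a FilterPredicate with FilterApi, wrap it with FilterCompat, and let RowGroupFilter invoke the visitor above to drop row groups whose statistics cannot match.

import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.RowGroupFilter;
import parquet.filter2.predicate.FilterApi;
import parquet.filter2.predicate.FilterPredicate;

FilterPredicate predicate = FilterApi.eq(FilterApi.intColumn("x"), 10); // placeholder column/value
List<BlockMetaData> keptBlocks =
    RowGroupFilter.filterRowGroups(FilterCompat.get(predicate), blocks, fileSchema); // blocks/fileSchema as in Example 3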
Example 9: toParquetMetadata
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  int numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
Author: grokcoder, Project: pbase, Lines: 23, Source: ParquetMetadataConverter.java
Example 10: generateSplitByDeprecatedConstructor
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private List<ParquetInputSplit> generateSplitByDeprecatedConstructor(long min, long max) throws
    IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  List<ClientSideMetadataSplitStrategy.SplitInfo> splitInfos = ClientSideMetadataSplitStrategy
      .generateSplitInfo(blocks, hdfsBlocks, min, max);

  for (ClientSideMetadataSplitStrategy.SplitInfo splitInfo : splitInfos) {
    BlockMetaData lastRowGroup = splitInfo.getRowGroups().get(splitInfo.getRowGroupCount() - 1);
    long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
    ParquetInputSplit split = new ParquetInputSplit(fileStatus.getPath(),
        splitInfo.hdfsBlock.getOffset(), end, splitInfo.hdfsBlock.getHosts(),
        splitInfo.rowGroups, schema.toString(), null, null, extramd);
    splits.add(split);
  }
  return splits;
}
Author: grokcoder, Project: pbase, Lines: 19, Source: TestInputFormat.java
Example 11: ParquetReader
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public ParquetReader(MessageType fileSchema,
        Map<String, String> extraMetadata,
        MessageType requestedSchema,
        Path file,
        List<BlockMetaData> blocks,
        Configuration configuration)
        throws IOException
{
    this.fileSchema = fileSchema;
    this.extraMetadata = extraMetadata;
    this.requestedSchema = requestedSchema;
    this.file = file;
    this.blocks = blocks;
    this.configuration = configuration;
    this.fileReader = new ParquetFileReader(configuration, file, blocks, requestedSchema.getColumns());
    for (BlockMetaData block : blocks) {
        fileRowCount += block.getRowCount();
    }
}
Author: y-lan, Project: presto, Lines: 20, Source: ParquetReader.java
Example 12: ParquetFileReader
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public ParquetFileReader(
        Configuration configuration,
        Path file,
        List<BlockMetaData> blocks,
        List<ColumnDescriptor> columns)
        throws IOException
{
    this.file = file;
    this.inputStream = file.getFileSystem(configuration).open(file);
    this.blocks = blocks;
    if (!blocks.isEmpty()) {
        for (ColumnDescriptor columnDescriptor : columns) {
            for (ColumnChunkMetaData metadata : blocks.get(0).getColumns()) {
                if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
                    columnMetadata.put(columnDescriptor, metadata);
                }
            }
        }
    }
    this.codecFactory = new ParquetCodecFactory(configuration);
}
Author: y-lan, Project: presto, Lines: 22, Source: ParquetFileReader.java
Example 13: getBlocks
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the blocks (row groups) of the parquet files.
 * @return
 */
public List<BlockMetaData> getBlocks ()
{
    if (this.blockMetaDataList == null || this.blockMetaDataList.size() == 0)
    {
        this.blockMetaDataList = new ArrayList<BlockMetaData>();
        for (ParquetFileMetadata meta : this.fileMetaDataList)
        {
            this.blockMetaDataList.addAll(meta.getBlocks());
        }
    }
    return this.blockMetaDataList;
}
Author: dbiir, Project: rainbow, Lines: 17, Source: ParquetMetadataStat.java
Example 14: getTotalSize
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the total compressed size of the parquet files.
 * @return
 */
@Override
public long getTotalSize ()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks())
    {
        size += meta.getCompressedSize();
    }
    return size;
}
Author: dbiir, Project: rainbow, Lines: 15, Source: ParquetMetadataStat.java
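A hypothetical companion method (not part of ParquetMetadataStat) could sum the uncompressed bytes with the same iteration pattern via BlockMetaData.getTotalByteSize(), which makes it easy to estimate an overall compression ratio:

public long getTotalUncompressedSize () // hypothetical helper, same style as getTotalSize()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks())
    {
        size += meta.getTotalByteSize(); // uncompressed bytes of the row group
    }
    return size;
}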
Example 15: testGetStat
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
@Test
public void testGetStat () throws IOException, MetadataException
{
    //ParquetMetadataStat stat = new ParquetMetadataStat("10.172.96.77", 9000, "/tmp/hive-root/hive_2015-02-04_10-57-36_131_1404874572956637570-1/_tmp.-ext-10002");
    ParquetMetadataStat stat = new ParquetMetadataStat("192.168.124.15", 9000, "/msra/parquet_test");
    double[] columnSize = stat.getAvgColumnChunkSize();
    List<String> names = stat.getFieldNames();
    int i = 0;
    double total = 0;
    for (double size : columnSize)
    {
        //System.out.println(names.get(i) + "\t" + size);
        total += size;
        i++;
    }
    System.out.println(total / 1024 / 1024 + "\t" + stat.getRowGroupCount() + "\t" + stat.getFileCount());

    for (BlockMetaData bm : stat.getBlocks())
    {
        System.out.println(bm.getCompressedSize() + ", " + bm.getTotalByteSize() + ", " + bm.getRowCount());
    }

    List<ParquetFileMetadata> metaDatas = stat.getFileMetaData();
    System.out.println(metaDatas.get(0).getFileMetaData().getCreatedBy());

    Map<String, String> keyValueMetaData = metaDatas.get(0).getFileMetaData().getKeyValueMetaData();
    for (String key : keyValueMetaData.keySet())
    {
        System.out.println(key + "=" + keyValueMetaData.get(key));
    }

    for (int j = 0; j < names.size(); ++j)
    {
        System.out.println(names.get(j) + "\t" + columnSize[j] + "\n");
    }
}
Author: dbiir, Project: rainbow, Lines: 36, Source: TestParquetMetadataStat.java
Example 16: ParquetFileReader
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * @param filePath the Parquet file (will be opened for read in this constructor)
 * @param blocks   the blocks to read
 * @param columns  the columns to read (their path)
 * //@param the codec used to compress the blocks
 * @throws IOException if the file can not be opened
 */
public ParquetFileReader(Configuration configuration, Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.filePath = filePath;
  FileSystem fs = filePath.getFileSystem(configuration);
  this.f = fs.open(filePath);
  this.blocks = blocks;
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.codecFactory = new CodecFactory(configuration);
}
Author: grokcoder, Project: pbase, Lines: 18, Source: ParquetFileReader.java
Example 17: readNextRowGroup
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Reads all the columns requested from the row group at the current file position.
 *
 * @return the PageReadStore which can provide PageReaders for each column.
 * @throws IOException if an error occurs while reading
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnChunkPageReadStore columnChunkPageReadStore = new ColumnChunkPageReadStore(block.getRowCount());

  // prepare the list of consecutive chunks to read them in one scan
  List<ConsecutiveChunkList> allChunks = new ArrayList<ConsecutiveChunkList>();
  ConsecutiveChunkList currentChunks = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first chunk or not consecutive => new list
      if (currentChunks == null || currentChunks.endPos() != startingPos) {
        currentChunks = new ConsecutiveChunkList(startingPos);
        allChunks.add(currentChunks);
      }
      currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
    }
  }

  // actually read all the chunks
  for (ConsecutiveChunkList consecutiveChunks : allChunks) {
    final List<Chunk> chunks = consecutiveChunks.readAll(f);
    for (Chunk chunk : chunks) {
      columnChunkPageReadStore.addColumn(chunk.descriptor.col, chunk.readAllPages());
    }
  }
  ++currentBlock;
  return columnChunkPageReadStore;
}
Author: grokcoder, Project: pbase, Lines: 43, Source: ParquetFileReader.java
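A typical way to drive readNextRowGroup is to loop until it returns null and hand each PageReadStore to a record-assembly layer. The sketch below is only illustrative: it assumes a reader constructed as in Example 16 and merely counts rows via PageReadStore.getRowCount().

PageReadStore pages;
long rows = 0;
while ((pages = reader.readNextRowGroup()) != null) {
  rows += pages.getRowCount(); // rows of the row group that was just read
  // hand 'pages' to a ColumnReadStore / record converter here
}
System.out.println("read " + rows + " rows");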
Example 18: startBlock
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Start a block.
 *
 * @param recordCount the record count in this block
 * @throws IOException
 */
public void startBlock(long recordCount) throws IOException {
  state = state.startBlock();
  if (DEBUG) LOG.debug(out.getPos() + ": start block");
  // out.write(MAGIC); // TODO: add a magic delimiter
  currentBlock = new BlockMetaData();
  currentRecordCount = recordCount;
}
Author: grokcoder, Project: pbase, Lines: 14, Source: ParquetFileWriter.java
Example 19: getParquetInputSplit
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
      fileStatus.getPath(),
      hdfsBlock.getOffset(),
      end,
      length,
      hdfsBlock.getHosts(),
      rowGroupOffsets
  );
}
Author: grokcoder, Project: pbase, Lines: 31, Source: ParquetInputFormat.java
Example 20: generateSplitInfo
import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
static List<SplitInfo> generateSplitInfo(
    List<BlockMetaData> rowGroupBlocks,
    BlockLocation[] hdfsBlocksArray,
    long minSplitSize, long maxSplitSize) {
  List<SplitInfo> splitRowGroups;
  if (maxSplitSize < minSplitSize || maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize and minSplitSize should be positive and max should be greater or equal to the minSplitSize: maxSplitSize = " + maxSplitSize + "; minSplitSize is " + minSplitSize);
  }
  HDFSBlocks hdfsBlocks = new HDFSBlocks(hdfsBlocksArray);
  hdfsBlocks.checkBelongingToANewHDFSBlock(rowGroupBlocks.get(0));
  SplitInfo currentSplit = new SplitInfo(hdfsBlocks.getCurrentBlock());

  // assign row groups to splits
  splitRowGroups = new ArrayList<SplitInfo>();
  checkSorted(rowGroupBlocks); // assert row groups are sorted
  for (BlockMetaData rowGroupMetadata : rowGroupBlocks) {
    if ((hdfsBlocks.checkBelongingToANewHDFSBlock(rowGroupMetadata)
        && currentSplit.getCompressedByteSize() >= minSplitSize
        && currentSplit.getCompressedByteSize() > 0)
        || currentSplit.getCompressedByteSize() >= maxSplitSize) {
      // create a new split
      splitRowGroups.add(currentSplit); // finish the previous split
      currentSplit = new SplitInfo(hdfsBlocks.getCurrentBlock());
    }
    currentSplit.addRowGroup(rowGroupMetadata);
  }
  if (currentSplit.getRowGroupCount() > 0) {
    splitRowGroups.add(currentSplit);
  }
  return splitRowGroups;
}
Author: grokcoder, Project: pbase, Lines: 35, Source: ParquetInputFormat.java
Note: the parquet.hadoop.metadata.BlockMetaData examples in this article were collected from source-code and documentation platforms such as GitHub and MSDocs, and the snippets were selected from open-source projects contributed by their authors. Copyright of the source code remains with the original authors; consult each project's license before distributing or using it, and do not reproduce without permission.