This article collects typical usage examples of the Java class org.kitesdk.data.DatasetDescriptor. If you are wondering what DatasetDescriptor is for, or how to use it, the curated class examples below may help.
The DatasetDescriptor class belongs to the org.kitesdk.data package. Twenty code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
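Before the examples, here is a minimal, self-contained sketch of the core pattern they all share: build a DatasetDescriptor with its Builder, then hand it to Datasets.create. This sketch is not taken from any project below; the inline schema and the dataset URI are placeholders.

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;

public class DescriptorSketch {
  public static void main(String[] args) {
    // A descriptor needs at least a schema; the format and other
    // properties are optional (the default format is Avro).
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"User\","
            + "\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}")
        .format(Formats.AVRO)
        .build();

    // Create the dataset at a placeholder URI; this throws
    // DatasetExistsException if the dataset already exists.
    Dataset<GenericRecord> users =
        Datasets.create("dataset:file:/tmp/example/users", descriptor);
  }
}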
Example 1: SavePolicy
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null, "Must set "
      + CONFIG_KITE_ERROR_DATASET_URI + " when " + CONFIG_FAILURE_POLICY
      + "=save");

  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }

  nEventsHandled = 0;
}
Developer: moueimei | Project: flume-release-1.7.0 | Lines: 17 | Source: SavePolicy.java
Example 2: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(SmallEvent.class)
      .build();

  dataset(EVENTS_DS_URI, descriptor);

  Topics.createTopic(context, TOPIC_NAME, 1, 1, SmallEvent.getClassSchema());

  StreamDescription streamDescription = new StreamDescription.Builder()
      .jobName("simple-spark-streaming")
      .jobClass(StreamingSparkJob.class)
      .withStream("event_stream", Topics.topic(TOPIC_NAME))
      .withView("event_output", EVENTS_DS_URI)
      .build();

  stream(streamDescription);
}
Developer: rbrush | Project: kite-apps | Lines: 21 | Source: StreamingSparkApp.java
Example 3: dataset
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
/**
 * Ensures the given dataset exists, creating it if it doesn't
 * and updating the schema if necessary.
 */
protected void dataset(String uri, DatasetDescriptor descriptor) {
  try {
    Datasets.create(uri, descriptor);
  } catch (DatasetExistsException e) {
    Dataset existingDataset = Datasets.load(uri);
    DatasetDescriptor updated;
    // The given descriptor might not have a location,
    // so use the current one.
    if (descriptor.getLocation() == null) {
      updated = new DatasetDescriptor.Builder(descriptor)
          .location(existingDataset.getDescriptor().getLocation())
          .build();
    } else {
      updated = descriptor;
    }
    Datasets.update(uri, updated);
  }
}
Developer: rbrush | Project: kite-apps | Lines: 30 | Source: AbstractApplication.java
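A note on the design: the helper attempts Datasets.create first and handles DatasetExistsException, rather than checking existence up front, which avoids a check-then-create race. A hedged usage sketch, matching how the setup methods elsewhere on this page call it (the URI and the Avro-generated MyEvent class are placeholders, not from the project):

DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(MyEvent.getClassSchema()) // hypothetical generated Avro class
    .build();
dataset("dataset:hive?dataset=my_events", descriptor); // create, or update in place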
Example 4: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
public void setup(AppContext context) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(KeyValues.getClassSchema())
      .build();

  dataset(OUTPUT_DATASET, descriptor);

  // Schedule our job to run at the top of every hour.
  Schedule schedule = new Schedule.Builder()
      .jobName("write-config-job")
      .jobClass(WriteConfigOutputJob.class)
      .frequency("0 * * * *")
      .withOutput("kv-output", OUTPUT_URI_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 19 | Source: WriteConfigOutputApp.java
Example 5: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(ExampleEvent.getClassSchema())
      .build();

  dataset(EVENTS_DS_URI, descriptor);

  Topics.createTopic(context, TOPIC_NAME, 1, 1, ExampleEvent.getClassSchema());

  StreamDescription streamDescription = new StreamDescription.Builder()
      .jobName("test-event-stream")
      .jobClass(TopicToDatasetJob.class)
      .withStream("event_stream", Topics.topic(TOPIC_NAME))
      .withView("event_output", EVENTS_DS_URI)
      .build();

  stream(streamDescription);
}
Developer: rbrush | Project: kite-apps | Lines: 21 | Source: TopicToDatasetApp.java
Example 6: run
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public int run(List<String> args) throws Exception {
  String inputUri = uri;
  String outputUri = "dataset:hive?dataset=correlated_events";

  if (args.size() == 1) {
    outputUri = args.get(0);
  }

  Preconditions.checkState(Datasets.exists(inputUri),
      "input dataset doesn't exist");

  if (!Datasets.exists(outputUri)) {
    Datasets.create(outputUri, new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build());
  }

  CorrelateEventsTask task = new CorrelateEventsTask(inputUri, outputUri);
  task.run();

  return 0;
}
Developer: kite-sdk | Project: kite-examples | Lines: 25 | Source: CorrelateEvents.java
Example 7: run
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public int run(List<String> args) throws Exception {
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class).build();

  View<StandardEvent> events = Datasets.create(uri, descriptor, StandardEvent.class);
  DatasetWriter<StandardEvent> writer = events.newWriter();
  try {
    // Generate random events for roughly 36 seconds (36,000 ms).
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  } finally {
    writer.close();
  }

  System.out.println("Generated " + counter + " events");

  return 0;
}
Developer: kite-sdk | Project: kite-examples | Lines: 24 | Source: CreateEvents.java
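Since Kite's DatasetWriter implements Closeable, on Java 7+ the try/finally above can be written more compactly with try-with-resources. A sketch under that assumption, reusing the example's own names:

try (DatasetWriter<StandardEvent> writer = events.newWriter()) {
  // Writer is closed automatically, even if write() throws.
  while (System.currentTimeMillis() - baseTimestamp < 36000) {
    writer.write(generateRandomEvent());
  }
}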
Example 8: testParquetDataset
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Test
public void testParquetDataset() throws EventDeliveryException {
  Datasets.delete(FILE_DATASET_URI);
  Dataset<GenericRecord> created = Datasets.create(FILE_DATASET_URI,
      new DatasetDescriptor.Builder(DESCRIPTOR)
          .format("parquet")
          .build());

  DatasetSink sink = sink(in, config);

  // run the sink
  sink.start();
  sink.process();

  // the transaction should not commit during the call to process
  assertThrows("Transaction should still be open", IllegalStateException.class,
      new Callable() {
        @Override
        public Object call() throws EventDeliveryException {
          in.getTransaction().begin();
          return null;
        }
      });
  // The records won't commit until the call to stop()
  Assert.assertEquals("Should not have committed", 0, read(created).size());

  sink.stop();

  Assert.assertEquals(Sets.newHashSet(expected), read(created));
  Assert.assertEquals("Should have committed", 0, remaining(in));
}
Developer: moueimei | Project: flume-release-1.7.0 | Lines: 32 | Source: TestDatasetSink.java
Example 9: testPartitionedData
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Test
public void testPartitionedData() throws EventDeliveryException {
  URI partitionedUri = URI.create("dataset:file:target/test_repo/partitioned");
  try {
    Datasets.create(partitionedUri, new DatasetDescriptor.Builder(DESCRIPTOR)
        .partitionStrategy(new PartitionStrategy.Builder()
            .identity("id", 10) // partition by id
            .build())
        .build());

    config.put(DatasetSinkConstants.CONFIG_KITE_DATASET_URI,
        partitionedUri.toString());
    DatasetSink sink = sink(in, config);

    // run the sink
    sink.start();
    sink.process();
    sink.stop();

    Assert.assertEquals(
        Sets.newHashSet(expected),
        read(Datasets.load(partitionedUri)));
    Assert.assertEquals("Should have committed", 0, remaining(in));
  } finally {
    if (Datasets.exists(partitionedUri)) {
      Datasets.delete(partitionedUri);
    }
  }
}
Developer: moueimei | Project: flume-release-1.7.0 | Lines: 30 | Source: TestDatasetSink.java
Example 10: createDataset
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
private static Dataset createDataset(Schema schema,
    CompressionType compressionType, String uri) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(schema)
      .format(Formats.PARQUET)
      .compressionType(compressionType)
      .build();
  return Datasets.create(uri, descriptor, GenericRecord.class);
}
Developer: aliyun | Project: aliyun-maxcompute-data-collectors | Lines: 10 | Source: ParquetJob.java
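A hedged usage sketch for the helper above: Kite's org.kitesdk.data.CompressionType enum includes a Snappy constant, so a caller could request a Snappy-compressed Parquet dataset roughly like this (the avroSchema variable and the URI are placeholders, not from the project):

Dataset ds = createDataset(
    avroSchema,                               // an org.apache.avro.Schema built elsewhere
    CompressionType.Snappy,                   // Snappy-compress the Parquet files
    "dataset:hive?dataset=imported_table");   // placeholder dataset URI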
Example 11: configureInputFormat
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
protected void configureInputFormat(Job job, String tableName, String tableClassName,
    String splitByCol) throws ClassNotFoundException, IOException {
  fileType = getInputFileType();

  super.configureInputFormat(job, tableName, tableClassName, splitByCol);

  if (isHCatJob) {
    SqoopHCatUtilities.configureExportInputFormat(options, job, context.getConnManager(),
        tableName, job.getConfiguration());
    return;
  } else if (fileType == FileType.AVRO_DATA_FILE) {
    LOG.debug("Configuring for Avro export");
    configureGenericRecordExportInputFormat(job, tableName);
  } else if (fileType == FileType.PARQUET_FILE) {
    LOG.debug("Configuring for Parquet export");
    configureGenericRecordExportInputFormat(job, tableName);
    FileSystem fs = FileSystem.get(job.getConfiguration());
    String uri = "dataset:" + fs.makeQualified(getInputPath());
    Exception caughtException = null;
    try {
      DatasetKeyInputFormat.configure(job).readFrom(uri);
    } catch (DatasetNotFoundException e) {
      LOG.warn(e.getMessage(), e);
      LOG.warn("Trying to get data schema from parquet file directly");
      caughtException = e;
    }
    if (caughtException != null && caughtException instanceof DatasetNotFoundException) {
      DatasetDescriptor descriptor = getDatasetDescriptorFromParquetFile(job, fs, uri);
      Dataset dataset = Datasets.create(uri, descriptor, GenericRecord.class);
      DatasetKeyInputFormat.configure(job).readFrom(dataset);
    }
  }

  FileInputFormat.addInputPath(job, getInputPath());
}
Developer: aliyun | Project: aliyun-maxcompute-data-collectors | Lines: 37 | Source: HdfsOdpsImportJob.java
Example 12: getDatasetDescriptorFromParquetFile
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
Developer: aliyun | Project: aliyun-maxcompute-data-collectors | Lines: 32 | Source: HdfsOdpsImportJob.java
Example 13: createDataset
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
/**
 * Creates a new dataset.
 */
public static Dataset<GenericRecord> createDataset(String uri, org.apache.sqoop.schema.Schema schema,
    FileFormat format) {
  Schema datasetSchema = KiteDataTypeUtil.createAvroSchema(schema);
  Format datasetFormat = KiteDataTypeUtil.toFormat(format);
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .property("kite.allow.csv", "true") // required for Kite to accept CSV datasets
      .schema(datasetSchema)
      .format(datasetFormat)
      .build();
  return Datasets.create(uri, descriptor);
}
Developer: vybs | Project: sqoop-on-spark | Lines: 15 | Source: KiteDatasetExecutor.java
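A hedged usage sketch: assuming the surrounding sqoop-on-spark code exposes a CSV constant on its FileFormat enum (which would explain the kite.allow.csv property set above), a caller would obtain the dataset roughly like this; the URI and the sqoopSchema variable are placeholders, not from the project:

Dataset<GenericRecord> ds = KiteDatasetExecutor.createDataset(
    "dataset:hdfs:/tmp/sqoop/import_target",  // placeholder dataset URI
    sqoopSchema,                              // org.apache.sqoop.schema.Schema built elsewhere
    FileFormat.CSV);                          // assumed enum constant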
Example 14: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  // Create the input and output datasets.
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(DatasetTestUtilities.USER_SCHEMA)
      .partitionStrategy(strategy)
      .build();

  dataset(INPUT_DATASET, descriptor);
  dataset(OUTPUT_DATASET, descriptor);

  // Schedule our job to run at the top of every hour.
  Schedule schedule = new Schedule.Builder()
      .jobName("simple-spark-job")
      .jobClass(SimpleSparkJob.class)
      .frequency("0 * * * *")
      .withInput("source.users", INPUT_URI_PATTERN, "* * * * *")
      .withOutput("target.users", OUTPUT_URI_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 31 | Source: SimpleSparkApp.java
Example 15: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
public void setup(AppContext context) {
  // Create the input and output datasets.
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(DatasetTestUtilities.USER_SCHEMA)
      .partitionStrategy(strategy)
      .build();

  dataset(INPUT_DATASET, descriptor);
  dataset(OUTPUT_DATASET, descriptor);

  // Schedule our job to run at the top of every hour.
  Schedule schedule = new Schedule.Builder()
      .jobName("test-job")
      .jobClass(ScheduledInputOutputJob.class)
      .frequency("0 * * * *")
      .withInput("source_users", INPUT_URI_PATTERN, "* * * * *")
      .withOutput("target_users", OUTPUT_URI_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 30 | Source: ScheduledInputOutputApp.java
Example 16: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(DatasetTestUtilities.USER_SCHEMA)
      .partitionStrategy(strategy)
      .build();

  // Create the input and output datasets
  dataset(INPUT1_DATASET, descriptor);
  dataset(INPUT2_DATASET, descriptor);
  dataset(OUTPUT_DATASET, descriptor);

  Schedule.Builder builder = new Schedule.Builder()
      .jobName("test_job")
      .jobClass(DynamicInputOutputJob.class)
      .frequency("*/5 * * * *")
      .withInput("input1", INPUT1_URI_PATTERN, "*/5 * * * *", GenericRecord.class)
      .withInput("input2", INPUT2_URI_PATTERN, "*/5 * * * *", GenericRecord.class)
      .withOutput("output", OUTPUT_URI_PATTERN, GenericRecord.class);

  schedule(builder.build());
}
Developer: rbrush | Project: kite-apps | Lines: 31 | Source: DynamicInputOutputApp.java
Example 17: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
public void setup(AppContext context) {
  // Create a dataset to contain items partitioned by event.
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .provided("minute", "int")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(ExampleEvent.getClassSchema())
      .partitionStrategy(strategy)
      .build();

  dataset(ODD_USER_DS_URI, descriptor);

  // Schedule our job to run every minute.
  Schedule schedule = new Schedule.Builder()
      .jobName("example-spark")
      .jobClass(SparkJob.class)
      .frequency("* * * * *")
      .withInput("example_events", EVENT_URI_PATTERN, "* * * * *")
      .withOutput("odd_users", ODD_USER_URI_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 30 | Source: SparkApp.java
Example 18: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  // Create a dataset to contain items partitioned by event.
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .provided("minute", "int")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(ExampleEvent.getClassSchema())
      .partitionStrategy(strategy)
      .build();

  dataset(ODD_USER_DS_URI, descriptor);

  // Schedule our job to run every minute.
  Schedule schedule = new Schedule.Builder()
      .jobName("example-triggered")
      .jobClass(TriggeredJob.class)
      .frequency("* * * * *")
      .withInput("example_events", EVENT_URI_PATTERN, "* * * * *")
      .withOutput("odd_users", ODD_USER_URI_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 31 | Source: TriggeredApp.java
Example 19: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
public void setup(AppContext context) {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("year", "int")
      .provided("month", "int")
      .provided("day", "int")
      .provided("hour", "int")
      .provided("minute", "int")
      .build();

  // Create our test dataset.
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(SCHEMA)
      .partitionStrategy(strategy)
      .build();

  dataset(REPORT_DS_URI, descriptor);

  // Schedule our report to run every five minutes.
  Schedule schedule = new Schedule.Builder()
      .jobName("example-scheduled-report")
      .jobClass(ScheduledReportJob.class)
      .frequency("*/5 * * * *")
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 28 | Source: ScheduledReportApp.java
Example 20: setup
import org.kitesdk.data.DatasetDescriptor; // import the required package/class
@Override
public void setup(AppContext context) {
  // Create the test dataset, partitioned by the minute
  // so we quickly create schedulable data.
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .hour("timestamp")
      .minute("timestamp")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(ExampleEvent.getClassSchema())
      .partitionStrategy(strategy)
      .build();

  dataset(EVENT_DS_URI, descriptor);

  // Schedule our data generation job to run every minute. We wire
  // the job's "generate.target" argument to the pattern so it will
  // be invoked with the corresponding view.
  Schedule schedule = new Schedule.Builder()
      .jobName("example-data-generator")
      .jobClass(DataGeneratorJob.class)
      .frequency("* * * * *")
      .withOutput("example_events", EVENT_DS_PATTERN)
      .build();

  schedule(schedule);
}
Developer: rbrush | Project: kite-apps | Lines: 33 | Source: DataGeneratorApp.java
Note: the org.kitesdk.data.DatasetDescriptor examples in this article are collected from open-source projects hosted on GitHub, MSDocs, and similar source-code and documentation platforms. Copyright of each snippet remains with its original authors; consult the corresponding project's License before redistributing or reusing the code. Do not repost without permission.