FileSystem - Apache Flink
File Source
The File Source is based on the unified Source API and reads files in both batch and streaming mode. It is divided into two parts: a SplitEnumerator and a SourceReader.

The SplitEnumerator is responsible for discovering and identifying the files to read, and assigns them to the SourceReader.

The SourceReader requests the files assigned to it and reads them from the file system. The File Source is combined with a format that defines how the file contents are parsed, for example CSV, Avro, or Parquet.
You can start building a File Source via one of the following API calls:

// reads the contents of a file from a file stream
FileSource.forRecordStreamFormat(StreamFormat,Path...);

// reads batches of records from a file at a time
FileSource.forBulkFileFormat(BulkFormat,Path...);
This creates a FileSource.FileSourceBuilder on which all properties of the File Source can be configured. The source is in bounded/batch mode by default and processes all files under the given paths. You can call AbstractFileSource.AbstractFileSourceBuilder.monitorContinuously(Duration) to put the source into continuous streaming mode, in which it periodically checks the paths for new files and starts reading them as they appear.
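As a small sketch of switching to continuous monitoring (the stream format and the monitoring interval are placeholders):

final FileSource<String> source =
        FileSource.forRecordStreamFormat(...)
                .monitorContinuously(Duration.ofMillis(5))
                .build();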
Format Types
The reading of each file happens through file readers defined by file formats, which define the parsing logic for the contents of the file. The source supports several format classes; these interfaces represent a tradeoff between simplicity of implementation and flexibility/efficiency.

A StreamFormat reads the contents of a file from a file stream. It is the simplest format to implement and provides many features out of the box (such as checkpointing logic), but is limited in the optimizations it can apply (such as object reuse and batching).

A BulkFormat reads batches of records from a file at a time. It is the most "low level" format to implement, but offers the greatest flexibility to optimize the implementation.
TextLine Format
This StreamFormat reads text lines from a file, using Java's built-in InputStreamReader to decode the byte stream with one of the supported character sets. The format does not support optimized recovery from checkpoints: on recovery it re-reads and discards the number of lines that were processed before the last checkpoint. This is because the line offsets in the file cannot be tracked through the charset decoder, which buffers stream input and decoder state internally.
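A minimal sketch of reading a text file line by line with this format (the input path is a placeholder):

import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.core.fs.Path;

// Build a bounded File Source that emits one String per text line.
final FileSource<String> source =
        FileSource.forRecordStreamFormat(new TextLineInputFormat(), new Path("/path/to/input"))
                .build();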
SimpleStreamFormat
This is a simple version of StreamFormat for formats that are not splittable. Custom reads of arrays or files can be done by implementing SimpleStreamFormat:
private static final class ArrayReaderFormat extends SimpleStreamFormat<byte[]> {
    private static final long serialVersionUID = 1L;

    @Override
    public Reader<byte[]> createReader(Configuration config, FSDataInputStream stream)
            throws IOException {
        return new ArrayReader(stream);
    }

    @Override
    public TypeInformation<byte[]> getProducedType() {
        return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
    }
}

final FileSource<byte[]> source =
        FileSource.forRecordStreamFormat(new ArrayReaderFormat(), path).build();
An example of a SimpleStreamFormat is CsvReaderFormat. In the simplest case it can be created with CsvReaderFormat.forPojo(SomePojo.class), in which case the CSV schema is derived automatically from the fields of the POJO class using the Jackson library. (You might need to add an @JsonPropertyOrder({...}) annotation to the class, with the field order matching the CSV columns exactly.)

If you need more fine-grained control over the CSV schema or the parsing options, use the lower-level forSchema static factory method of CsvReaderFormat:
CsvReaderFormat<T> forSchema(Supplier<CsvMapper> mapperFactory,
Function<CsvMapper, CsvSchema> schemaGenerator,
TypeInformation<T> typeInformation)
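A small sketch of building a CSV-backed File Source (SomePojo and the local file name are placeholders):

import java.io.File;

import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.csv.CsvReaderFormat;

// Derive the CSV schema from the POJO fields and read the file as SomePojo records.
CsvReaderFormat<SomePojo> csvFormat = CsvReaderFormat.forPojo(SomePojo.class);
FileSource<SomePojo> source =
        FileSource.forRecordStreamFormat(csvFormat, Path.fromLocalFile(new File("records.csv")))
                .build();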
Bulk Format
The BulkFormat reads and decodes batches of records at a time; examples of bulk formats are ORC and Parquet. The outer BulkFormat class acts mainly as a configuration holder and factory for the reader. The actual reading is done by a BulkFormat.Reader, which is created in the BulkFormat#createReader(Configuration, FileSourceSplit) method. If a bulk reader is created from a checkpoint during checkpointed streaming execution, the reader is re-created via the BulkFormat#restoreReader(Configuration, FileSourceSplit) method.
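A StreamFormat can also be used where a BulkFormat is expected by wrapping it in an adapter; a sketch (SomePojo is a placeholder type):

import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.connector.file.src.impl.StreamFormatAdapter;
import org.apache.flink.connector.file.src.reader.BulkFormat;
import org.apache.flink.formats.csv.CsvReaderFormat;

// Wrap a (Simple)StreamFormat so it can be used through the BulkFormat interface.
BulkFormat<SomePojo, FileSourceSplit> bulkFormat =
        new StreamFormatAdapter<>(CsvReaderFormat.forPojo(SomePojo.class));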
Customizing File Enumeration

For a Hive source, for example, the splitting of files into source splits can be customized by implementing a FileEnumerator together with a FileEnumerator.Provider:
/**
 * A FileEnumerator implementation for a Hive source, which generates splits based on
 * HiveTablePartition.
 */
public class HiveSourceFileEnumerator implements FileEnumerator {
// reference constructor
public HiveSourceFileEnumerator(...) {
...
}
/***
 * Generates all file splits for the relevant files under the given paths. The {@code
 * minDesiredSplits} is an optional hint indicating how many splits would be necessary to
 * exploit parallelism properly.
 */
@Override
public Collection<FileSourceSplit> enumerateSplits(Path[] paths, int minDesiredSplits)
throws IOException {
// createInputSplits:splitting files into fragmented collections
return new ArrayList<>(createInputSplits(...));
}
...
/***
 * A factory to create HiveSourceFileEnumerator.
 */
public static class Provider implements FileEnumerator.Provider {
...
@Override
public FileEnumerator create() {
return new HiveSourceFileEnumerator(...);
}
}
}
// use the customized file enumeration
new HiveSource<>(
...,
new HiveSourceFileEnumerator.Provider(
partitions != null ? partitions : Collections.emptyList(),
new JobConfWrapper(jobConf)),
...);
Current Limitations

Watermarking does not work very well for large backlogs of files, because watermarks eagerly advance within a file and the next file might contain data later than the current watermark.
File Sink
The File Sink writes incoming data into buckets. Given that the incoming streams can be unbounded, the data in each bucket is organized into part files of finite size. The bucketing behaviour is fully configurable; the default time-based bucketing starts a new bucket every hour.

Each bucket contains at least one part file for every subtask of the sink that has received data for that bucket, and additional part files are created according to the configurable rolling policy. Note that checkpointing must be enabled when using the FileSink in STREAMING mode: part files can only be finalized on successful checkpoints, and with checkpointing disabled they stay forever in the in-progress or pending state and cannot be safely read by downstream systems.

Format Types

The FileSink supports both row-wise and bulk encoding formats, such as Apache Parquet. The two variants come with their respective builders, created via FileSink.forRowFormat(basePath, rowEncoder) and FileSink.forBulkFormat(basePath, bulkWriterFactory). Please check out the JavaDoc for FileSink for all the configuration options and for more documentation about the implementation of the different data formats.
Row-encoded Formats
Row-encoded formats need to specify an Encoder that is used to serialize individual rows to the OutputStream of the in-progress part files. Basic usage for writing String elements looks like this:
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import java.time.Duration;
DataStream<String> input = ...;
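// Sketch of the sink definition used below (outputPath is assumed to be defined elsewhere):
final FileSink<String> sink = FileSink
        .forRowFormat(new Path(outputPath), new SimpleStringEncoder<String>("UTF-8"))
        .withRollingPolicy(
                DefaultRollingPolicy.builder()
                        .withRolloverInterval(Duration.ofMinutes(15))
                        .withInactivityInterval(Duration.ofMinutes(5))
                        .withMaxPartSize(MemorySize.ofMebiBytes(1024))
                        .build())
        .build();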
input.sinkTo(sink);
This example creates a simple sink that assigns records to the default one-hour time buckets. It also specifies a rolling policy that rolls the in-progress part file on any of the following three conditions:

It contains at least 15 minutes worth of data
It has not received new records for the last 5 minutes
The file size has reached 1 GB (after writing the last record)
Bulk-encoded Formats
Bulk-encoded sinks are created similarly to row-encoded ones, but instead of specifying an Encoder you specify a BulkWriter.Factory. The BulkWriter logic defines how new elements are added and flushed, and how a batch of records is finalized. Note that bulk formats can only be used with a rolling policy that extends CheckpointRollingPolicy, which rolls on every checkpoint (optionally also based on size or time). Flink has five built-in BulkWriter factories:
ParquetWriterFactory
AvroWriterFactory
SequenceFileWriterFactory
CompressWriterFactory
OrcBulkWriterFactory
Parquet Format
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet_2.12</artifactId>
<version>1.16.0</version>
</dependency>
To use this format in PyFlink jobs, the corresponding JAR needs to be added; it can be obtained from the Download page.
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.formats.parquet.avro.AvroParquetWriters;
import org.apache.avro.Schema;
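// Sketch of the sink definition used below (outputBasePath is assumed to be defined elsewhere):
Schema schema = ...;
DataStream<GenericRecord> input = ...;

final FileSink<GenericRecord> sink = FileSink
        .forBulkFormat(outputBasePath, AvroParquetWriters.forGenericRecord(schema))
        .build();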
input.sinkTo(sink);
Similarly, a FileSink that writes Protobuf data in Parquet format can be created like this:
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.formats.parquet.protobuf.ParquetProtoWriters;
// ProtoRecord is a generated protobuf Message class.
DataStream<ProtoRecord> input = ...;
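// Sketch of the sink definition used below (outputBasePath is assumed to be defined elsewhere):
final FileSink<ProtoRecord> sink = FileSink
        .forBulkFormat(outputBasePath, ParquetProtoWriters.forType(ProtoRecord.class))
        .build();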
input.sinkTo(sink);
For PyFlink users, ParquetBulkWriters can be used to create a BulkWriterFactory that writes Rows into Parquet files:
row_type = DataTypes.ROW([
DataTypes.FIELD('string', DataTypes.STRING()),
DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
])
sink = FileSink.for_bulk_format(
OUTPUT_DIR, ParquetBulkWriters.for_row_type(
row_type,
hadoop_config=Configuration(),
utc_timestamp=True,
)
).build()
ds.sink_to(sink)
Avro Format
Flink also provides built-in support for writing data into Avro files. A list of convenience methods for creating Avro writer factories can be found in the AvroWriters class. To use the Avro writers in your application, add the following dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-avro</artifactId>
<version>1.16.0</version>
</dependency>
To use this format in PyFlink jobs, the corresponding JAR needs to be added; it can be obtained from the Download page.
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.formats.avro.AvroWriters;
import org.apache.avro.Schema;
Schema schema = ...;
DataStream<GenericRecord> input = ...;
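// Sketch of the sink definition used below (outputBasePath is assumed to be defined elsewhere):
final FileSink<GenericRecord> sink = FileSink
        .forBulkFormat(outputBasePath, AvroWriters.forGenericRecord(schema))
        .build();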
input.sinkTo(sink);
ORC Format
To bulk-encode data in the ORC format, Flink provides OrcBulkWriterFactory, which takes a concrete implementation of Vectorizer. Like other columnar formats, Flink's ORC writer writes input elements in batches, using ORC's VectorizedRowBatch. Since input elements have to be transformed into a VectorizedRowBatch, users extend the abstract Vectorizer class and override the vectorize(T element, VectorizedRowBatch batch) method. The method provides a VectorizedRowBatch instance, so the implementation only has to transform the input element into ColumnVectors and set them in the provided VectorizedRowBatch instance.
For example, if the input element is of type Person:
class Person {
private final String name;
private final int age;
...
}
Then a child implementation that converts Person elements and sets them in the VectorizedRowBatch could look like this:
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
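// Sketch of the PersonVectorizer implementation (assuming getName()/getAge() accessors on Person):
public class PersonVectorizer extends Vectorizer<Person> implements Serializable {

    public PersonVectorizer(String schema) {
        super(schema);
    }

    @Override
    public void vectorize(Person element, VectorizedRowBatch batch) throws IOException {
        BytesColumnVector nameColVector = (BytesColumnVector) batch.cols[0];
        LongColumnVector ageColVector = (LongColumnVector) batch.cols[1];
        int row = batch.size++;
        nameColVector.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8));
        ageColVector.vector[row] = element.getAge();
    }
}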
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-orc_2.12</artifactId>
<version>1.16.0</version>
</dependency>
Then a FileSink that writes data in the ORC format can be created like this:
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.orc.writer.OrcBulkWriterFactory;
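// Sketch of the writer factory and sink definition used below (outputBasePath is assumed to be defined elsewhere):
String schema = "struct<_col0:string,_col1:int>";
DataStream<Person> input = ...;

final OrcBulkWriterFactory<Person> writerFactory =
        new OrcBulkWriterFactory<>(new PersonVectorizer(schema));

final FileSink<Person> sink = FileSink
        .forBulkFormat(outputBasePath, writerFactory)
        .build();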
input.sinkTo(sink);
OrcBulkWriterFactory can also take a Hadoop Configuration and Properties, so that custom Hadoop settings and ORC writer properties can be provided:
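// Sketch: declare the ORC writer properties used below (schema and conf are placeholders):
String schema = ...;
Configuration conf = ...;
Properties writerProperties = new Properties();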
writerProperties.setProperty("orc.compress", "LZ4");
// Other ORC supported properties can be set in the same way.
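// Sketch: pass the vectorizer, writer properties and Hadoop configuration to the factory:
final OrcBulkWriterFactory<Person> writerFactory = new OrcBulkWriterFactory<>(
        new PersonVectorizer(schema), writerProperties, conf);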
Users who want to add user metadata to the ORC files can do so by calling addUserMetadata(...) inside the overridden vectorize(...) method:
public class PersonVectorizer extends Vectorizer<Person> implements Serializable {
@Override
public void vectorize(Person element, VectorizedRowBatch batch) throws IOException {
...
String metadataKey = ...;
ByteBuffer metadataValue = ...;
this.addUserMetadata(metadataKey, metadataValue);
}
}
For PyFlink users, OrcBulkWriters can be used to create a BulkWriterFactory that writes records to files in the ORC format (the required JAR can be obtained from the Download page):
row_type = DataTypes.ROW([
DataTypes.FIELD('name', DataTypes.STRING()),
DataTypes.FIELD('age', DataTypes.INT()),
])
sink = FileSink.for_bulk_format(
OUTPUT_DIR,
OrcBulkWriters.for_row_type(
row_type=row_type,
writer_properties=Configuration(),
hadoop_config=Configuration(),
)
).build()
ds.sink_to(sink)
Hadoop SequenceFile Format

To use the SequenceFile bulk encoder in your application, add the following dependency:

<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sequence-file</artifactId>
<version>1.16.0</version>
</dependency>
A simple SequenceFile writer can be created like this:
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
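// Sketch of the sink definition used below (outputBasePath is assumed to be defined elsewhere):
DataStream<Tuple2<LongWritable, Text>> input = ...;
Configuration hadoopConf = HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration());

final FileSink<Tuple2<LongWritable, Text>> sink = FileSink
        .forBulkFormat(
                outputBasePath,
                new SequenceFileWriterFactory<>(hadoopConf, LongWritable.class, Text.class))
        .build();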
input.sinkTo(sink);
The SequenceFileWriterFactory supports additional constructor parameters to specify compression settings.
Bucket Assignment

The bucketing logic defines how the data is structured into subdirectories inside the base output directory. Both row and bulk formats use the DateTimeBucketAssigner as the default assigner, which creates hourly buckets based on the system default timezone with the format yyyy-MM-dd--HH; both the date format (i.e. the bucket size) and the timezone can be configured manually. A custom BucketAssigner can be specified by calling .withBucketAssigner(assigner) on the format builders (see the sketch after the lists below).

Flink comes with two built-in BucketAssigners:

DateTimeBucketAssigner: the default, time-based assigner
BasePathBucketAssigner: an assigner that stores all part files in the base path (a single global bucket)

Rolling Policy

The RollingPolicy defines when an in-progress part file is closed and moved to the pending and later to the finished state. In STREAMING mode the rolling policy, together with the checkpointing interval (pending files become finished on the next checkpoint), controls how quickly part files become available to downstream readers, as well as their size and number. In BATCH mode part files become visible at the end of the job, but the rolling policy can control their maximum size.

Flink comes with two built-in RollingPolicies:

DefaultRollingPolicy
OnCheckpointRollingPolicy
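A sketch of plugging a custom bucket assigner and rolling policy into a row-format builder (outputPath is assumed to be defined; the choice of BasePathBucketAssigner is just an example):

import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.BasePathBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;

// Write all part files into the base path and roll them on every checkpoint.
FileSink<String> sink = FileSink
        .forRowFormat(new Path(outputPath), new SimpleStringEncoder<String>("UTF-8"))
        .withBucketAssigner(new BasePathBucketAssigner<>())
        .withRollingPolicy(OnCheckpointRollingPolicy.build())
        .build();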
Part File Lifecycle

In order to use the output of the FileSink in downstream systems, we need to understand the naming and lifecycle of the part files it produces. Part files can be in one of three states:

In-progress: the part file that is currently being written to
Pending: an in-progress file that has been closed (due to the rolling policy) and is waiting to be committed
Finished: on a successful checkpoint (STREAMING) or at the end of input (BATCH), a pending file transitions to finished

Only finished files are safe for downstream systems to read, since they are guaranteed not to be modified later. Each writer subtask has at most one in-progress part file per active bucket at any time, but there can be several pending and finished files.

Part File Example
To better understand the lifecycle of these files, consider a sink with two subtasks:
└── 2019-08-25--12
├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-
└── part-81fc4980-a6af-41c8-9937-9939408a734b-0.inprogress.ea65a428-a1d0-4a0b-bbc5-
When the part file part-81fc4980-a6af-41c8-9937-9939408a734b-0 is rolled (say, because it grew too large), it becomes pending but is not renamed yet. The sink then opens a new part file, part-81fc4980-a6af-41c8-9937-9939408a734b-1:
└── 2019-08-25--12
├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-
├── part-81fc4980-a6af-41c8-9937-9939408a734b-0.inprogress.ea65a428-a1d0-4a0b-bbc5-
└── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-
Since part-81fc4980-a6af-41c8-9937-9939408a734b-0 is now pending completion, it is finalized after the next successful checkpoint:

└── 2019-08-25--12
├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-
├── part-81fc4980-a6af-41c8-9937-9939408a734b-0
└── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-
New buckets are created as dictated by the bucketing policy, which does not affect currently in-progress files:
└── 2019-08-25--12
├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-
├── part-81fc4980-a6af-41c8-9937-9939408a734b-0
└── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-
└── 2019-08-25--13
└── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.2b475fec-1482-4dea-9946-
Old buckets can still receive new records, as the bucketing policy is evaluated on a per-record basis.

Part File Configuration

Finished files can be distinguished from in-progress files only by their naming scheme. By default, in-progress and pending files are named part-<uid>-<partFileIndex>.inprogress.uid and finished files part-<uid>-<partFileIndex>, where uid is a random id assigned to a sink subtask when it is instantiated (the uid is not fault-tolerant and is regenerated when the subtask recovers from a failure).

Flink allows specifying a prefix and/or a suffix for the part files. For example, with the prefix "prefix" and the suffix ".ext" the sink creates files like the following:
└── 2019-08-25--12
├── prefix-4005733d-a830-4323-8291-8866de98b582-0.ext
├── prefix-4005733d-a830-4323-8291-8866de98b582-1.ext.inprogress.bd053eb0-5ecf-4c85
├── prefix-81fc4980-a6af-41c8-9937-9939408a734b-0.ext
└── prefix-81fc4980-a6af-41c8-9937-9939408a734b-1.ext.inprogress.bc279efe-b16f-47d8
The user can specify an OutputFileConfig in the following way:
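A sketch, assuming outputPath is defined elsewhere; the prefix and suffix are the ones from the listing above:

OutputFileConfig config = OutputFileConfig
        .builder()
        .withPartPrefix("prefix")
        .withPartSuffix(".ext")
        .build();

FileSink<String> sink = FileSink
        .forRowFormat(new Path(outputPath), new SimpleStringEncoder<String>("UTF-8"))
        .withOutputFileConfig(config)
        .build();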
Compaction

Since version 1.15 the FileSink supports compaction of pending files, which allows the application to use a smaller checkpoint interval without generating a large number of small files, especially when bulk-encoded formats (which must roll on every checkpoint) are used. Compaction is enabled like this:

FileSink<Integer> fileSink =
FileSink.forRowFormat(new Path(path),new SimpleStringEncoder<Integer>())
.enableCompact(
FileCompactStrategy.Builder.newBuilder()
.setNumCompactThreads(1024)
.enableCompactionOnCheckpoint(5)
.build(),
new RecordWiseFileCompactor<>(
new DecoderBasedReader.Factory<>(SimpleStringDecoder::new)))
.build();
Once enabled, compaction happens between the time a file becomes pending and the time it is committed. The pending files are first committed as temporary files whose paths start with ".". They are then compacted by the configured compactor according to the configured strategy, and new, compacted pending files are produced. These pending files are finally handed to the committer to be committed as the formal files, after which the source files are removed.
When enabling compaction you need to specify both a FileCompactStrategy and a FileCompactor.

The FileCompactStrategy specifies when and which files get compacted. There are currently two parallel conditions: the target file size and the number of checkpoints that have passed. Once the total size of the cached files reaches the size threshold, or the number of checkpoints since the last compaction reaches the configured number, the FileSink will compact the cached files.
The FileCompactor specifies how to compact the given list of files and write the result file. Depending on how the result is written, it falls into one of two types:

OutputStreamBasedFileCompactor: writes the compacted result into an output stream. This is useful when the records cannot, or should not, be read back from the input files. An example is the ConcatFileCompactor, which simply concatenates the input files.
RecordWiseFileCompactor: reads records one by one from the input files and writes them, just like a FileWriter, through a CompactingFileWriter into the result file. The user needs to specify how to read records from the source files.
Important Note 1: Once compaction has been enabled, you must explicitly call disableCompact when building the FileSink if you want to disable it again.
Important Note 2: When compaction is enabled, written files need to wait longer before they become visible.
BATCH Mode

Important Note 1: Even though the Writer runs with the user-specified parallelism, the Committer always runs with a parallelism of 1.
Important Note 2: Pending files are committed, i.e. transition to Finished, only after the whole input has been processed.
Important Note 3: With high availability enabled, if a JobManager failure happens while the Committers are committing, duplicates may be produced. This will be fixed in future Flink versions (see FLIP-147 for progress).
S3-Specific Issues

Important Note 1: For S3, the FileSink supports only the Hadoop-based FileSystem implementation, not the implementation based on Presto. If your job uses the FileSink to write to S3 but relies on the Presto implementation for checkpointing, it is advised to explicitly use "s3a://" as the scheme of the sink's target path (for Hadoop) and "s3p://" for checkpointing (for Presto). Using "s3://" for both the sink and checkpointing may lead to unpredictable behavior, since both implementations "listen" to that scheme.