@@ -77,8 +77,9 @@ protected BaseFileWriterFactory(Table table, FileFormat dataFileFormat, Schema d
protected abstract void configureEqualityDelete(Parquet.DeleteWriteBuilder builder);
protected abstract void configurePositionDelete(Parquet.DeleteWriteBuilder builder);

// TODO: provide ways to configure ORC delete writers once we support them
protected abstract void configureDataWrite(ORC.DataWriteBuilder builder);
protected abstract void configureEqualityDelete(ORC.DeleteWriteBuilder builder);
protected abstract void configurePositionDelete(ORC.DeleteWriteBuilder builder);

@Override
public DataWriter<T> newDataWriter(EncryptedOutputFile file, PartitionSpec spec, StructLike partition) {
@@ -184,6 +185,22 @@ public EqualityDeleteWriter<T> newEqualityDeleteWriter(EncryptedOutputFile file,

return parquetBuilder.buildEqualityWriter();

case ORC:
ORC.DeleteWriteBuilder orcBuilder = ORC.writeDeletes(outputFile)
.setAll(properties)
.metricsConfig(metricsConfig)
.rowSchema(equalityDeleteRowSchema)
.equalityFieldIds(equalityFieldIds)
.withSpec(spec)
.withPartition(partition)
.withKeyMetadata(keyMetadata)
.withSortOrder(equalityDeleteSortOrder)
.overwrite();

configureEqualityDelete(orcBuilder);

return orcBuilder.buildEqualityWriter();

default:
throw new UnsupportedOperationException("Unsupported format for equality deletes: " + deleteFileFormat);
}
@@ -230,6 +247,20 @@ public PositionDeleteWriter<T> newPositionDeleteWriter(EncryptedOutputFile file,

return parquetBuilder.buildPositionWriter();

case ORC:
ORC.DeleteWriteBuilder orcBuilder = ORC.writeDeletes(outputFile)
.setAll(properties)
.metricsConfig(metricsConfig)
.rowSchema(positionDeleteRowSchema)
.withSpec(spec)
.withPartition(partition)
.withKeyMetadata(keyMetadata)
.overwrite();

configurePositionDelete(orcBuilder);

return orcBuilder.buildPositionWriter();

default:
throw new UnsupportedOperationException("Unsupported format for position deletes: " + deleteFileFormat);
}
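The new abstract ORC hooks are implemented by each engine-specific factory; the Flink variant appears later in this diff. As a rough illustration of what a generic-record factory could do with the same hooks (a sketch, not code from this change; the GenericOrcWriter method reference mirrors the test usage further down, and whether it matches the delete builder's writer-function shape is an assumption):

@Override
protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) {
  // assumption: GenericOrcWriter.buildWriter fits the delete builder's createWriterFunc signature
  builder.createWriterFunc(GenericOrcWriter::buildWriter);
}

@Override
protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) {
  builder.createWriterFunc(GenericOrcWriter::buildWriter);
}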
11 changes: 11 additions & 0 deletions data/src/main/java/org/apache/iceberg/data/DeleteFilter.java
@@ -34,11 +34,13 @@
import org.apache.iceberg.StructLike;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.data.avro.DataReader;
import org.apache.iceberg.data.orc.GenericOrcReader;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
import org.apache.iceberg.deletes.Deletes;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
@@ -227,6 +229,15 @@ private CloseableIterable<Record> openDeletes(DeleteFile deleteFile, Schema dele
return builder.build();

case ORC:
ORC.ReadBuilder orcBuilder = ORC.read(input)
.project(deleteSchema)
.createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema));

if (deleteFile.content() == FileContent.POSITION_DELETES) {
orcBuilder.filter(Expressions.equal(MetadataColumns.DELETE_FILE_PATH.name(), dataFile.path()));
}

return orcBuilder.build();
default:
throw new UnsupportedOperationException(String.format(
"Cannot read deletes, %s is not a supported format: %s", deleteFile.format().name(), deleteFile.path()));
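For position deletes, the filter above keeps only the rows whose file_path column matches the data file currently being read. A minimal sketch of reading such an ORC delete file directly with the same pieces (illustrative only; io is assumed to be the table's FileIO and deleteSchema to contain the file_path and pos metadata columns):

try (CloseableIterable<Record> rows = ORC.read(io.newInputFile(deleteFile.path().toString()))
    .project(deleteSchema)
    .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema))
    .filter(Expressions.equal(MetadataColumns.DELETE_FILE_PATH.name(), dataFile.path()))
    .build()) {
  // each record holds (file_path, pos[, row]) for one deleted position in dataFile
  rows.forEach(record -> process(record));  // process(...) is a placeholder
}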
@@ -76,10 +76,10 @@ public void writeFile() throws IOException {
.createWriterFunc(GenericOrcWriter::buildWriter)
.schema(DATA_SCHEMA)
// write in such a way that the file contains 2 stripes each with 4 row groups of 1000 rows
.config("iceberg.orc.vectorbatch.size", "1000")
.config(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000")
.config(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "4000")
.config(OrcConf.STRIPE_SIZE.getAttribute(), "1")
.set("iceberg.orc.vectorbatch.size", "1000")
.set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000")
.set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "4000")
.set(OrcConf.STRIPE_SIZE.getAttribute(), "1")
.build()) {
writer.addAll(DATA_ROWS);
}
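For reference, the renamed settings are what produce the layout promised by the comment above ("2 stripes each with 4 row groups of 1000 rows"); the breakdown below reflects standard ORC writer semantics and assumes DATA_ROWS holds 8000 records, as that layout implies:

// iceberg.orc.vectorbatch.size   = 1000   -> rows per VectorizedRowBatch handed to the writer
// orc.row.index.stride           = 1000   -> one row group (row index entry) per 1000 rows
// orc.rows.between.memory.checks = 4000   -> the writer checks memory use every 4000 rows
// orc.stripe.size                = 1      -> a 1-byte stripe target, so every memory check flushes a stripe
//
// 8000 rows total / 4000 rows per flush = 2 stripes; 4000 rows / 1000-row stride = 4 row groups per stripe.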
@@ -33,11 +33,13 @@
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.avro.DataReader;
import org.apache.iceberg.data.orc.GenericOrcReader;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDelete;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
@@ -131,8 +133,6 @@ public void testDataWriter() throws IOException {

@Test
public void testEqualityDeleteWriter() throws IOException {
Assume.assumeFalse("ORC delete files are not supported", fileFormat == FileFormat.ORC);

List<Integer> equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId());
Schema equalityDeleteRowSchema = table.schema().select("id");
FileWriterFactory<T> writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema);
@@ -179,7 +179,6 @@ public void testEqualityDeleteWriter() throws IOException {

@Test
public void testEqualityDeleteWriterWithMultipleSpecs() throws IOException {
Assume.assumeFalse("ORC delete files are not supported", fileFormat == FileFormat.ORC);
Assume.assumeFalse("Table must start unpartitioned", partitioned);

List<Integer> equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId());
@@ -242,8 +241,6 @@ public void testEqualityDeleteWriterWithMultipleSpecs() throws IOException {

@Test
public void testPositionDeleteWriter() throws IOException {
Assume.assumeFalse("ORC delete files are not supported", fileFormat == FileFormat.ORC);

FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());

// write a data file
@@ -400,6 +397,15 @@ private List<Record> readFile(Schema schema, InputFile inputFile) throws IOExcep
return ImmutableList.copyOf(records);
}

case ORC:
try (CloseableIterable<Record> records = ORC.read(inputFile)
.project(schema)
.createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema))
.build()) {

return ImmutableList.copyOf(records);
}

default:
throw new UnsupportedOperationException("Unsupported read file format: " + fileFormat);
}
@@ -109,6 +109,16 @@ protected void configureDataWrite(ORC.DataWriteBuilder builder) {
builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema));
}

@Override
protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) {
builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema));
}

@Override
protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) {
builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema));
}

private RowType dataFlinkType() {
if (dataFlinkType == null) {
Preconditions.checkNotNull(dataSchema(), "Data schema must not be null");
@@ -19,6 +19,7 @@

package org.apache.iceberg.data.orc;

import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
@@ -36,6 +37,8 @@
import org.apache.iceberg.DoubleFieldMetrics;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.FloatFieldMetrics;
import org.apache.iceberg.deletes.PositionDelete;
import org.apache.iceberg.orc.OrcRowWriter;
import org.apache.iceberg.orc.OrcValueWriter;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
@@ -49,6 +52,7 @@
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.MapColumnVector;
import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

public class GenericOrcWriters {
private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);
@@ -135,6 +139,10 @@ public static <K, V> OrcValueWriter<Map<K, V>> map(OrcValueWriter<K> key, OrcVal
return new MapWriter<>(key, value);
}

public static <T> OrcRowWriter<PositionDelete<T>> positionDelete(OrcRowWriter<T> writer) {
return new PositionDeleteWriter<>(writer);
}

private static class BooleanWriter implements OrcValueWriter<Boolean> {
private static final OrcValueWriter<Boolean> INSTANCE = new BooleanWriter();

@@ -531,6 +539,32 @@ public Stream<FieldMetrics<?>> metrics() {
}
}

// Writes PositionDelete entries: file_path goes to column 0 and pos to column 1; when a
// deleted-row payload is present, the batch is handed to the wrapped row writer.
private static class PositionDeleteWriter<T> implements OrcRowWriter<PositionDelete<T>> {
private final OrcRowWriter<T> rowWriter;

PositionDeleteWriter(OrcRowWriter<T> rowWriter) {
this.rowWriter = rowWriter;
}

@Override
public void write(PositionDelete<T> row, VectorizedRowBatch output) throws IOException {
int rowId = output.size;

GenericOrcWriters.strings().write(rowId, row.path().toString(), output.cols[0]);
GenericOrcWriters.longs().write(rowId, row.pos(), output.cols[1]);
if (row.row() != null) {
rowWriter.write(row.row(), output);
} else {
output.size += 1;
}
}

@Override
public Stream<FieldMetrics<?>> metrics() {
return rowWriter.metrics();
}
}

private static void growColumnVector(ColumnVector cv, int requestedSize) {
if (cv.isNull.length < requestedSize) {
// Use growth factor of 3 to avoid frequent array allocations
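The PositionDeleteWriter added above writes file_path into column 0 and pos into column 1, delegating to the wrapped row writer only when a deleted-row payload is present. A small usage sketch (illustrative names; GenericOrcWriter.buildWriter's signature and PositionDelete's create()/set(...) calls are assumptions here):

// Compose a position-delete row writer from a plain record writer and write one entry.
OrcRowWriter<Record> recordWriter = GenericOrcWriter.buildWriter(rowSchema, fileSchema);   // assumed signature
OrcRowWriter<PositionDelete<Record>> deleteWriter = GenericOrcWriters.positionDelete(recordWriter);

VectorizedRowBatch batch = fileSchema.createRowBatch();
PositionDelete<Record> delete = PositionDelete.create();
delete.set("file:/tmp/data/00000.orc", 3L, null);   // mark row 3 of that data file deleted; no row payload
deleteWriter.write(delete, batch);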