data/src/main/java/org/apache/iceberg/data/DeleteFilter.java (15 additions, 0 deletions)
@@ -71,6 +71,7 @@ public abstract class DeleteFilter<T> {
private final Accessor<StructLike> posAccessor;

private PositionDeleteIndex deleteRowPositions = null;
private Predicate<T> eqDeleteRows = null;

protected DeleteFilter(FileScanTask task, Schema tableSchema, Schema requestedSchema) {
this.setFilterThreshold = DEFAULT_SET_FILTER_THRESHOLD;
@@ -105,6 +106,10 @@ public boolean hasPosDeletes() {
return !posDeletes.isEmpty();
}

public boolean hasEqDeletes() {
return !eqDeletes.isEmpty();
}

Accessor<StructLike> posAccessor() {
return posAccessor;
}
@@ -192,6 +197,16 @@ protected boolean shouldKeep(T item) {
return remainingRowsFilter.filter(records);
}

public Predicate<T> eqDeletedRowFilter() {
if (eqDeleteRows == null) {
eqDeleteRows = applyEqDeletes().stream()
.map(Predicate::negate)
.reduce(Predicate::and)
.orElse(t -> true);
}
return eqDeleteRows;
}

public PositionDeleteIndex deletedRowPositions() {
if (posDeletes.isEmpty()) {
return null;
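The reduce chain in eqDeletedRowFilter() is worth unpacking: applyEqDeletes() yields one predicate per set of equality deletes, each matching the rows that set deletes. Negating every predicate and AND-ing the results produces a single "keep" filter that passes a row only if it matches none of the deletes, and orElse(t -> true) keeps everything when there are no equality deletes at all. A minimal standalone sketch of the same composition (hypothetical class and toy predicates, not part of the patch):

import java.util.List;
import java.util.function.Predicate;

public class EqDeletePredicateSketch {
  public static void main(String[] args) {
    // Toy per-delete predicates: rows with id 2 or id 5 are deleted.
    List<Predicate<Long>> deletes = List.of(id -> id == 2L, id -> id == 5L);

    // Same reduction as eqDeletedRowFilter(): negate each delete predicate,
    // AND them together, and default to "keep everything" when no deletes exist.
    Predicate<Long> keep = deletes.stream()
        .map(Predicate::negate)
        .reduce(Predicate::and)
        .orElse(id -> true);

    System.out.println(keep.test(1L)); // true: matches no delete, row is kept
    System.out.println(keep.test(2L)); // false: matches a delete, row is dropped
  }
}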
org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java
@@ -33,15 +33,18 @@
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.deletes.PositionDelete;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.ClusteredEqualityDeleteWriter;
import org.apache.iceberg.io.ClusteredPositionDeleteWriter;
import org.apache.iceberg.io.OutputFileFactory;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.TearDown;
@@ -190,6 +193,56 @@
rowDelta.validateDeletedFiles().commit();
}

protected void writeEqDeletes(long numRows, double percentage) throws IOException {
Set<Long> deletedValues = Sets.newHashSet();
while (deletedValues.size() < numRows * percentage) {
deletedValues.add(ThreadLocalRandom.current().nextLong(numRows));
}

List<InternalRow> rows = Lists.newArrayList();
for (Long value : deletedValues) {
GenericInternalRow genericInternalRow = new GenericInternalRow(7);
genericInternalRow.setLong(0, value);
genericInternalRow.setInt(1, (int) (value % Integer.MAX_VALUE));
genericInternalRow.setFloat(2, (float) value);
genericInternalRow.setNullAt(3);
genericInternalRow.setNullAt(4);
genericInternalRow.setNullAt(5);
genericInternalRow.setNullAt(6);
rows.add(genericInternalRow);
}
LOG.info("Num of equality deleted rows: {}", rows.size());

writeEqDeletes(rows);
}

private void writeEqDeletes(List<InternalRow> rows) throws IOException {
int equalityFieldId = table().schema().findField("longCol").fieldId();

OutputFileFactory fileFactory = newFileFactory();
SparkFileWriterFactory writerFactory = SparkFileWriterFactory
.builderFor(table())
.dataFileFormat(fileFormat())
.equalityDeleteRowSchema(table().schema())
.equalityFieldIds(new int[]{equalityFieldId})
.build();

ClusteredEqualityDeleteWriter<InternalRow> writer = new ClusteredEqualityDeleteWriter<>(
writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

PartitionSpec unpartitionedSpec = table().specs().get(0);
try (ClusteredEqualityDeleteWriter<InternalRow> closeableWriter = writer) {
for (InternalRow row : rows) {
closeableWriter.write(row, unpartitionedSpec, null);
}
}

RowDelta rowDelta = table().newRowDelta();
LOG.info("Num of Delete File: {}", writer.result().deleteFiles().size());
writer.result().deleteFiles().forEach(rowDelta::addDeletes);
rowDelta.validateDeletedFiles().commit();
}

private OutputFileFactory newFileFactory() {
return OutputFileFactory.builderFor(table(), 1, 1)
.format(fileFormat())
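A note on the sampling loop at the top of writeEqDeletes(long, double): it draws uniformly random keys in [0, numRows) into a Set until the set holds numRows * percentage distinct values, so duplicate draws are absorbed and exactly the requested share of rows ends up targeted by equality deletes. A self-contained sketch of that idea (hypothetical class name, not part of the patch):

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;

public class DistinctDeleteSampler {
  // Collect numRows * percentage distinct keys in [0, numRows).
  public static Set<Long> sample(long numRows, double percentage) {
    Set<Long> deletedValues = new HashSet<>();
    while (deletedValues.size() < numRows * percentage) {
      // Duplicates are simply re-drawn; as percentage approaches 1 the tail
      // of this loop slows down (coupon collector), which is fine for setup.
      deletedValues.add(ThreadLocalRandom.current().nextLong(numRows));
    }
    return deletedValues;
  }

  public static void main(String[] args) {
    // 5% of 1,000 rows -> 50 distinct keys to delete.
    System.out.println(sample(1_000L, 0.05).size()); // prints 50
  }
}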
org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java (new file)
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg.spark.source.parquet;

import java.io.IOException;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.spark.source.IcebergSourceDeleteBenchmark;
import org.openjdk.jmh.annotations.Param;

/**
* A benchmark that evaluates non-vectorized and vectorized reads with equality deletes in the Spark data source
* for Iceberg.
* <p>
* This class uses a dataset with a flat schema.
* To run this benchmark for spark-3.2:
* <code>
* ./gradlew :iceberg-spark:iceberg-spark-3.2:jmh
* -PjmhIncludeRegex=IcebergSourceParquetEqDeleteBenchmark
* -PjmhOutputPath=benchmark/iceberg-source-parquet-eq-delete-benchmark-result.txt
* </code>
*/
public class IcebergSourceParquetEqDeleteBenchmark extends IcebergSourceDeleteBenchmark {
@Param({"0", "0.000001", "0.05", "0.25", "0.5", "1"})
private double percentDeleteRow;

@Override
protected void appendData() throws IOException {
for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
writeData(fileNum);

if (percentDeleteRow > 0) {
// add equality deletes
table().refresh();
writeEqDeletes(NUM_ROWS, percentDeleteRow);
}
}
}

@Override
protected FileFormat fileFormat() {
return FileFormat.PARQUET;
}
}
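For readers new to JMH, the @Param on percentDeleteRow is what turns this single class into a sweep: JMH re-runs the benchmark once per listed value, so the results cover delete ratios from 0 through 1 without any extra wiring. A minimal self-contained illustration of that mechanism (hypothetical names, not part of the patch):

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

@State(Scope.Benchmark)
public class ParamSweepSketch {
  // JMH executes the benchmark once per value, just like percentDeleteRow above.
  @Param({"0", "0.25", "1"})
  private double ratio;

  @Benchmark
  public double work() {
    // Stand-in workload; the Iceberg benchmarks read the table instead.
    return Math.pow(ratio, 2);
  }
}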
org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java (renamed from IcebergSourceParquetDeleteBenchmark.java)
@@ -33,11 +33,11 @@
* To run this benchmark for spark-3.2:
* <code>
* ./gradlew :iceberg-spark:iceberg-spark-3.2:jmh
-* -PjmhIncludeRegex=IcebergSourceParquetDeleteBenchmark
-* -PjmhOutputPath=benchmark/iceberg-source-parquet-delete-benchmark-result.txt
+* -PjmhIncludeRegex=IcebergSourceParquetPosDeleteBenchmark
+* -PjmhOutputPath=benchmark/iceberg-source-parquet-pos-delete-benchmark-result.txt
 * </code>
 */
-public class IcebergSourceParquetDeleteBenchmark extends IcebergSourceDeleteBenchmark {
+public class IcebergSourceParquetPosDeleteBenchmark extends IcebergSourceDeleteBenchmark {
@Param({"0", "0.000001", "0.05", "0.25", "0.5", "1"})
private double percentDeleteRow;
