@@ -107,10 +107,13 @@ public class ParquetRewriterTest {
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean usingHadoop;

-private List<EncryptionTestFile> inputFiles = null;
+private List<EncryptionTestFile> inputFiles = Lists.newArrayList();
private String outputFile = null;
private ParquetRewriter rewriter = null;

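+// Shared input files referenced by the ensureContains*File() helpers below.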
+private final EncryptionTestFile gzipEncryptionTestFileWithoutBloomFilterColumn;
+private final EncryptionTestFile uncompressedEncryptionTestFileWithoutBloomFilterColumn;
+
@Parameterized.Parameters(name = "WriterVersion = {0}, IndexCacheStrategy = {1}, UsingHadoop = {2}")
public static Object[][] parameters() {
return new Object[][] {
@@ -121,10 +124,26 @@ public static Object[][] parameters() {
};
}

-public ParquetRewriterTest(String writerVersion, String indexCacheStrategy, boolean usingHadoop) {
+public ParquetRewriterTest(String writerVersion, String indexCacheStrategy, boolean usingHadoop)
+throws IOException {
this.writerVersion = ParquetProperties.WriterVersion.fromString(writerVersion);
this.indexCacheStrategy = IndexCache.CacheStrategy.valueOf(indexCacheStrategy);
this.usingHadoop = usingHadoop;
+
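+// Build the GZIP and UNCOMPRESSED input files up front; tests add them to inputFiles on demand.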
+MessageType testSchema = createSchema();
+this.gzipEncryptionTestFileWithoutBloomFilterColumn = new TestFileBuilder(conf, testSchema)
+.withNumRecord(numRecord)
+.withCodec("GZIP")
+.withPageSize(1024)
+.withWriterVersion(this.writerVersion)
+.build();
+
+this.uncompressedEncryptionTestFileWithoutBloomFilterColumn = new TestFileBuilder(conf, testSchema)
+.withNumRecord(numRecord)
+.withCodec("UNCOMPRESSED")
+.withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE)
+.withWriterVersion(this.writerVersion)
+.build();
}

private void testPruneSingleColumnTranslateCodec(List<Path> inputPaths) throws Exception {
@@ -141,7 +160,7 @@ private void testPruneSingleColumnTranslateCodec(List<Path> inputPaths) throws Exception
rewriter.processBlocks();
rewriter.close();

-// Verify the schema are not changed for the columns not pruned
+// Verify the schema is not changed for the columns not pruned
validateSchema();

// Verify codec has been translated
@@ -179,7 +198,7 @@ public void setUp() {

@Test
public void testPruneSingleColumnTranslateCodecSingleFile() throws Exception {
testSingleInputFileSetup("GZIP");
ensureContainsGzipFile();
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -190,7 +209,8 @@ public void testPruneSingleColumnTranslateCodecSingleFile() throws Exception {

@Test
public void testPruneSingleColumnTranslateCodecTwoFiles() throws Exception {
-testMultipleInputFilesSetup();
+ensureContainsGzipFile();
+ensureContainsUncompressedFile();
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -249,7 +269,8 @@ private void testPruneNullifyTranslateCodec(List<Path> inputPaths) throws Exception

@Test
public void testPruneNullifyTranslateCodecSingleFile() throws Exception {
testSingleInputFileSetup("GZIP");
ensureContainsGzipFile();

List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -260,7 +281,9 @@ public void testPruneNullifyTranslateCodecSingleFile() throws Exception {

@Test
public void testPruneNullifyTranslateCodecTwoFiles() throws Exception {
-testMultipleInputFilesSetup();
+ensureContainsGzipFile();
+ensureContainsUncompressedFile();
+
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -294,7 +317,7 @@ private void testPruneEncryptTranslateCodec(List<Path> inputPaths) throws Exception
rewriter.processBlocks();
rewriter.close();

-// Verify the schema are not changed for the columns not pruned
+// Verify the schema is not changed for the columns not pruned
validateSchema();

// Verify codec has been translated
@@ -331,7 +354,8 @@ private void testPruneEncryptTranslateCodec(List<Path> inputPaths) throws Exception

@Test
public void testPruneEncryptTranslateCodecSingleFile() throws Exception {
testSingleInputFileSetup("GZIP");
ensureContainsGzipFile();

List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -342,7 +366,9 @@ public void testPruneEncryptTranslateCodecSingleFile() throws Exception {

@Test
public void testPruneEncryptTranslateCodecTwoFiles() throws Exception {
-testMultipleInputFilesSetup();
+ensureContainsGzipFile();
+ensureContainsUncompressedFile();
+
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -383,7 +409,7 @@ public void testRewriteWithoutColumnIndexes() throws Exception {
rewriter.processBlocks();
rewriter.close();

-// Verify the schema are not changed for the columns not pruned
+// Verify the schema is not changed for the columns not pruned
ParquetMetadata pmd =
ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
MessageType schema = pmd.getFileMetaData().getSchema();
@@ -413,7 +439,7 @@ public void testRewriteWithoutColumnIndexes() throws Exception {
assertEquals(inRead.getLong("id", 0), outRead.getLong("id", 0));
assertEquals(inRead.getString("name", 0), outRead.getString("name", 0));

-// location was nulled
+// location was null
Group finalOutRead = outRead;
assertThrows(
RuntimeException.class,
@@ -422,7 +448,7 @@ public void testRewriteWithoutColumnIndexes() throws Exception {
RuntimeException.class,
() -> finalOutRead.getGroup("location", 0).getDouble("lon", 0));

-// phonenumbers was pruned
+// phone numbers was pruned
assertThrows(InvalidRecordException.class, () -> finalOutRead.getGroup("phoneNumbers", 0));
}
}
@@ -497,7 +523,8 @@ private void testNullifyAndEncryptColumn(List<Path> inputPaths) throws Exception

@Test
public void testNullifyEncryptSingleFile() throws Exception {
testSingleInputFileSetup("GZIP");
ensureContainsGzipFile();

List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -508,7 +535,9 @@ public void testNullifyEncryptSingleFile() throws Exception {

@Test
public void testNullifyEncryptTwoFiles() throws Exception {
-testMultipleInputFilesSetup();
+ensureContainsGzipFile();
+ensureContainsUncompressedFile();
+
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -520,7 +549,8 @@ public void testNullifyEncryptTwoFiles() throws Exception {

@Test
public void testMergeTwoFilesOnly() throws Exception {
-testMultipleInputFilesSetup();
+ensureContainsGzipFile();
+ensureContainsUncompressedFile();

// Only merge two files but do not change anything.
List<Path> inputPaths = new ArrayList<>();
@@ -534,7 +564,7 @@ public void testMergeTwoFilesOnly() throws Exception {
rewriter.processBlocks();
rewriter.close();

-// Verify the schema are not changed
+// Verify the schema is not changed
ParquetMetadata pmd =
ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
MessageType schema = pmd.getFileMetaData().getSchema();
@@ -615,7 +645,8 @@ public void testMergeTwoFilesWithDifferentSchema() throws Exception {

@Test
public void testRewriteFileWithMultipleBlocks() throws Exception {
testSingleInputFileSetup("GZIP", 1024L);
ensureContainsGzipFile();

List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -626,7 +657,7 @@ public void testRewriteFileWithMultipleBlocks() throws Exception {

@Test
public void testPruneSingleColumnTranslateCodecAndEnableBloomFilter() throws Exception {
testSingleInputFileSetupWithBloomFilter("GZIP", "DocId");
testSingleInputFileSetupWithBloomFilter("DocId");
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -635,14 +666,14 @@ public void testPruneSingleColumnTranslateCodecAndEnableBloomFilter() throws Exception
testPruneSingleColumnTranslateCodec(inputPaths);

// Verify bloom filters
-Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters(null);
+Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters();
Map<ColumnPath, List<BloomFilter>> outputBloomFilters = allOutputBloomFilters(null);
assertEquals(inputBloomFilters, outputBloomFilters);
}

@Test
public void testPruneNullifyTranslateCodecAndEnableBloomFilter() throws Exception {
testSingleInputFileSetupWithBloomFilter("GZIP", "DocId", "Links.Forward");
testSingleInputFileSetupWithBloomFilter("DocId", "Links.Forward");
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -651,7 +682,7 @@ public void testPruneNullifyTranslateCodecAndEnableBloomFilter() throws Exception
testPruneNullifyTranslateCodec(inputPaths);

// Verify bloom filters
-Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters(null);
+Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters();
assertEquals(inputBloomFilters.size(), 2);
assertTrue(inputBloomFilters.containsKey(ColumnPath.fromDotString("Links.Forward")));
assertTrue(inputBloomFilters.containsKey(ColumnPath.fromDotString("DocId")));
@@ -666,7 +697,7 @@ public void testPruneNullifyTranslateCodecAndEnableBloomFilter() throws Exception

@Test
public void testPruneEncryptTranslateCodecAndEnableBloomFilter() throws Exception {
testSingleInputFileSetupWithBloomFilter("GZIP", "DocId", "Links.Forward");
testSingleInputFileSetupWithBloomFilter("DocId", "Links.Forward");
List<Path> inputPaths = new ArrayList<Path>() {
{
add(new Path(inputFiles.get(0).getFileName()));
@@ -675,7 +706,7 @@ public void testPruneEncryptTranslateCodecAndEnableBloomFilter() throws Exception
testPruneEncryptTranslateCodec(inputPaths);

// Verify bloom filters
-Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters(null);
+Map<ColumnPath, List<BloomFilter>> inputBloomFilters = allInputBloomFilters();

// Cannot read without FileDecryptionProperties
assertThrows(ParquetCryptoRuntimeException.class, () -> allOutputBloomFilters(null));
@@ -685,42 +716,19 @@ public void testPruneEncryptTranslateCodecAndEnableBloomFilter() throws Exception
assertEquals(inputBloomFilters, outputBloomFilters);
}

-private void testSingleInputFileSetup(String compression) throws IOException {
-testSingleInputFileSetup(compression, ParquetWriter.DEFAULT_BLOCK_SIZE);
-}
-
-private void testSingleInputFileSetupWithBloomFilter(String compression, String... bloomFilterEnabledColumns)
-throws IOException {
-testSingleInputFileSetup(compression, ParquetWriter.DEFAULT_BLOCK_SIZE, bloomFilterEnabledColumns);
+private void testSingleInputFileSetupWithBloomFilter(String... bloomFilterEnabledColumns) throws IOException {
+testSingleInputFileSetup(bloomFilterEnabledColumns);
}

-private void testSingleInputFileSetup(String compression, long rowGroupSize, String... bloomFilterEnabledColumns)
-throws IOException {
-MessageType schema = createSchema();
-inputFiles = Lists.newArrayList();
-inputFiles.add(new TestFileBuilder(conf, schema)
-.withNumRecord(numRecord)
-.withCodec(compression)
-.withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE)
-.withRowGroupSize(rowGroupSize)
-.withBloomFilterEnabled(bloomFilterEnabledColumns)
-.withWriterVersion(writerVersion)
-.build());
-}
-
-private void testMultipleInputFilesSetup() throws IOException {
+private void testSingleInputFileSetup(String... bloomFilterEnabledColumns) throws IOException {
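+// Used by the bloom filter tests, which need an input file with bloom filters enabled on specific columns.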
MessageType schema = createSchema();
-inputFiles = Lists.newArrayList();
inputFiles.add(new TestFileBuilder(conf, schema)
.withNumRecord(numRecord)
.withCodec("GZIP")
.withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE)
-.withWriterVersion(writerVersion)
-.build());
-inputFiles.add(new TestFileBuilder(conf, schema)
-.withNumRecord(numRecord)
-.withCodec("UNCOMPRESSED")
-.withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE)
+.withRowGroupSize(ParquetWriter.DEFAULT_BLOCK_SIZE)
+.withBloomFilterEnabled(bloomFilterEnabledColumns)
.withWriterVersion(writerVersion)
.build());
}
@@ -748,7 +756,7 @@ private void validateColumnData(
.withDecryption(fileDecryptionProperties)
.build();

-// Get total number of rows from input files
+// Get the total number of rows from input files
int totalRows = 0;
for (EncryptionTestFile inputFile : inputFiles) {
totalRows += inputFile.getFileContent().length;
@@ -821,7 +829,7 @@ private ParquetMetadata getFileMetaData(String file, FileDecryptionProperties fileDecryptionProperties
ParquetReadOptions readOptions = ParquetReadOptions.builder()
.withDecryption(fileDecryptionProperties)
.build();
-ParquetMetadata pmd = null;
+ParquetMetadata pmd;
InputFile inputFile = HadoopInputFile.fromPath(new Path(file), conf);
try (SeekableInputStream in = inputFile.newStream()) {
pmd = ParquetFileReader.readFooter(inputFile, readOptions, in);
@@ -995,12 +1003,10 @@ private void validateRowGroupRowCount() throws Exception {
assertEquals(inputRowCounts, outputRowCounts);
}

-private Map<ColumnPath, List<BloomFilter>> allInputBloomFilters(FileDecryptionProperties fileDecryptionProperties)
-throws Exception {
+private Map<ColumnPath, List<BloomFilter>> allInputBloomFilters() throws Exception {
Map<ColumnPath, List<BloomFilter>> inputBloomFilters = new HashMap<>();
for (EncryptionTestFile inputFile : inputFiles) {
-Map<ColumnPath, List<BloomFilter>> bloomFilters =
-allBloomFilters(inputFile.getFileName(), fileDecryptionProperties);
+Map<ColumnPath, List<BloomFilter>> bloomFilters = allBloomFilters(inputFile.getFileName(), null);
for (Map.Entry<ColumnPath, List<BloomFilter>> entry : bloomFilters.entrySet()) {
List<BloomFilter> bloomFilterList = inputBloomFilters.getOrDefault(entry.getKey(), new ArrayList<>());
bloomFilterList.addAll(entry.getValue());
@@ -1072,4 +1078,16 @@ private void validateSchema() throws IOException {
assertEquals(subFields.get(0).getName(), "Backward");
assertEquals(subFields.get(1).getName(), "Forward");
}
+
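+// Idempotently add the shared GZIP input file to this test's input list.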
+private void ensureContainsGzipFile() {
+if (!inputFiles.contains(gzipEncryptionTestFileWithoutBloomFilterColumn)) {
+inputFiles.add(this.gzipEncryptionTestFileWithoutBloomFilterColumn);
+}
+}
+
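+// Idempotently add the shared UNCOMPRESSED input file to this test's input list.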
+private void ensureContainsUncompressedFile() {
+if (!inputFiles.contains(uncompressedEncryptionTestFileWithoutBloomFilterColumn)) {
+inputFiles.add(uncompressedEncryptionTestFileWithoutBloomFilterColumn);
+}
+}
}