diff options
author | Christoph Cullmann <[email protected]> | 2024-06-11 18:11:36 +0200 |
---|---|---|
committer | Christoph Cullmann <[email protected]> | 2024-06-18 15:55:33 +0000 |
commit | 607b3b2feb1328fdf8cf01768276d615c110e304 (patch) | |
tree | dc84f9dd6b6bc967b001196c6a5d5b66b8e48887 | |
parent | a8b7da59cba56b535393f50cd7432a412021d8d2 (diff) |
rcc: de-duplicate data in resources
Content-based de-duplication: file contents are hashed with SHA-256, and
whenever candidates with the same hash value are found, the full data is compared.
Task-number: QTBUG-126168
Change-Id: Ifebc8ca322e354d8ea1f701f27f3f65916f7555c
Reviewed-by: hjk <[email protected]>
-rw-r--r-- | src/tools/rcc/rcc.cpp | 71 | ||||
-rw-r--r-- | tests/auto/tools/rcc/data/deduplication/deduplication.expected | 157 | ||||
-rw-r--r-- | tests/auto/tools/rcc/data/deduplication/deduplication.qrc | 10 | ||||
-rw-r--r-- | tests/auto/tools/rcc/data/deduplication/files/a.txt | 1 | ||||
-rw-r--r-- | tests/auto/tools/rcc/data/deduplication/files/b.txt | 1 | ||||
-rw-r--r-- | tests/auto/tools/rcc/data/deduplication/files/c_with_a_content.txt | 1 | ||||
-rw-r--r-- | tests/auto/tools/rcc/tst_rcc.cpp | 5 |
7 files changed, 238 insertions, 8 deletions
diff --git a/src/tools/rcc/rcc.cpp b/src/tools/rcc/rcc.cpp index a1089914fd2..06f9ae1015c 100644 --- a/src/tools/rcc/rcc.cpp +++ b/src/tools/rcc/rcc.cpp @@ -1,10 +1,12 @@ // Copyright (C) 2018 The Qt Company Ltd. // Copyright (C) 2018 Intel Corporation. +// Copyright (C) 2024 Christoph Cullmann <[email protected]> // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 #include "rcc.h" #include <qbytearray.h> +#include <qcryptographichash.h> #include <qdatetime.h> #include <qdebug.h> #include <qdir.h> @@ -90,8 +92,28 @@ public: QString resourceName() const; + struct DeduplicationKey { + RCCResourceLibrary::CompressionAlgorithm compressAlgo; + int compressLevel; + int compressThreshold; + QByteArray hash; + + bool operator==(const DeduplicationKey &other) const + { + return compressAlgo == other.compressAlgo && + compressLevel == other.compressLevel && + compressThreshold == other.compressThreshold && + hash == other.hash; + } + }; + + typedef QMultiHash<DeduplicationKey, RCCFileInfo*> DeduplicationMultiHash; + public: - qint64 writeDataBlob(RCCResourceLibrary &lib, qint64 offset, QString *errorMessage); + qint64 writeDataBlob(RCCResourceLibrary &lib, + qint64 offset, + DeduplicationMultiHash &dedupByContent, + QString *errorMessage); qint64 writeDataName(RCCResourceLibrary &, qint64 offset); void writeDataInfo(RCCResourceLibrary &lib); @@ -114,6 +136,11 @@ public: qint64 m_childOffset = 0; }; +static size_t qHash(const RCCFileInfo::DeduplicationKey &key, size_t seed) noexcept +{ + return qHashMulti(seed, key.compressAlgo, key.compressLevel, key.compressThreshold, key.hash); +} + RCCFileInfo::RCCFileInfo(const QString &name, const QFileInfo &fileInfo, QLocale::Language language, QLocale::Territory territory, uint flags, RCCResourceLibrary::CompressionAlgorithm compressAlgo, int compressLevel, @@ -217,8 +244,10 @@ void RCCFileInfo::writeDataInfo(RCCResourceLibrary &lib) } } -qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary 
&lib, qint64 offset, - QString *errorMessage) +qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary &lib, + qint64 offset, + DeduplicationMultiHash &dedupByContent, + QString *errorMessage) { const bool text = lib.m_format == RCCResourceLibrary::C_Code; const bool pass1 = lib.m_format == RCCResourceLibrary::Pass1; @@ -231,14 +260,38 @@ qint64 RCCFileInfo::writeDataBlob(RCCResourceLibrary &lib, qint64 offset, QByteArray data; if (!m_isEmpty) { - //find the data to be written - QFile file(m_fileInfo.absoluteFilePath()); + // find the data to be written + const QString absoluteFilePath = m_fileInfo.absoluteFilePath(); + QFile file(absoluteFilePath); if (!file.open(QFile::ReadOnly)) { - *errorMessage = msgOpenReadFailed(m_fileInfo.absoluteFilePath(), file.errorString()); + *errorMessage = msgOpenReadFailed(absoluteFilePath, file.errorString()); return 0; } - data = file.readAll(); + + // de-duplicate the same file content, we can re-use already written data + // we only do that if we have the same compression settings + const QByteArray hash = QCryptographicHash::hash(data, QCryptographicHash::Sha256); + const DeduplicationKey key{m_compressAlgo, m_compressLevel, m_compressThreshold, hash}; + const QList<RCCFileInfo *> potentialCandidates = dedupByContent.values(key); + for (const RCCFileInfo *candidate : potentialCandidates) { + // check real content, we can have collisions + QFile candidateFile(candidate->m_fileInfo.absoluteFilePath()); + if (!candidateFile.open(QFile::ReadOnly)) { + *errorMessage = msgOpenReadFailed(candidate->m_fileInfo.absoluteFilePath(), + candidateFile.errorString()); + return 0; + } + if (data != candidateFile.readAll()) { + continue; + } + // just remember the offset & flags with final compression state + // of the already written data and be done + m_dataOffset = candidate->m_dataOffset; + m_flags = candidate->m_flags; + return offset; + } + dedupByContent.insert(key, this); } // Check if compression is useful for this file @@ -1168,6 +1221,7 
@@ bool RCCResourceLibrary::writeDataBlobs() QStack<RCCFileInfo*> pending; pending.push(m_root); qint64 offset = 0; + RCCFileInfo::DeduplicationMultiHash dedupByContent; QString errorMessage; while (!pending.isEmpty()) { RCCFileInfo *file = pending.pop(); @@ -1176,7 +1230,8 @@ bool RCCResourceLibrary::writeDataBlobs() if (child->m_flags & RCCFileInfo::Directory) pending.push(child); else { - offset = child->writeDataBlob(*this, offset, &errorMessage); + offset = child->writeDataBlob(*this, offset, + dedupByContent, &errorMessage); if (offset == 0) { m_errorDevice->write(errorMessage.toUtf8()); return false; diff --git a/tests/auto/tools/rcc/data/deduplication/deduplication.expected b/tests/auto/tools/rcc/data/deduplication/deduplication.expected new file mode 100644 index 00000000000..bd873437b46 --- /dev/null +++ b/tests/auto/tools/rcc/data/deduplication/deduplication.expected @@ -0,0 +1,157 @@ +/**************************************************************************** +** Resource object code +** +IGNORE:** Created by: The Resource Compiler for Qt version 6.9.0 +** +** WARNING! All changes made in this file will be lost! +*****************************************************************************/ + +#ifdef _MSC_VER +// disable informational message "function ... 
selected for automatic inline expansion" +#pragma warning (disable: 4711) +#endif + +static const unsigned char qt_resource_data[] = { + // b.txt + 0x0,0x0,0x0,0xb, + 0x62, + 0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65, + // c_with_a_content.txt + 0x0,0x0,0x0,0xb, + 0x61, + 0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65, + // b.txt + 0x0,0x0,0x0,0xb, + 0x62, + 0x20,0x74,0x65,0x73,0x74,0x20,0x66,0x69,0x6c,0x65, + +}; + +static const unsigned char qt_resource_name[] = { + // files + 0x0,0x5, + 0x0,0x6d,0x2,0xc3, + 0x0,0x66, + 0x0,0x69,0x0,0x6c,0x0,0x65,0x0,0x73, + // b.txt + 0x0,0x5, + 0x0,0x65,0x5b,0xf4, + 0x0,0x62, + 0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74, + // c_with_a_content.txt + 0x0,0x14, + 0x1,0x61,0x1d,0x34, + 0x0,0x63, + 0x0,0x5f,0x0,0x77,0x0,0x69,0x0,0x74,0x0,0x68,0x0,0x5f,0x0,0x61,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6e,0x0,0x74,0x0,0x65,0x0,0x6e,0x0,0x74,0x0,0x2e, + 0x0,0x74,0x0,0x78,0x0,0x74, + // a.txt + 0x0,0x5, + 0x0,0x64,0x5b,0xf4, + 0x0,0x61, + 0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74, + // alias_of_b_compress9.txt + 0x0,0x18, + 0xb,0x26,0xf,0xb4, + 0x0,0x61, + 0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6d,0x0,0x70,0x0,0x72,0x0,0x65, + 0x0,0x73,0x0,0x73,0x0,0x39,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74, + // alias_of_b.txt + 0x0,0xe, + 0x1,0xa4,0x6d,0x34, + 0x0,0x61, + 0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74, + // alias_of_b_compress9_dupe.txt + 0x0,0x1d, + 0x9,0x4,0x7a,0x14, + 0x0,0x61, + 0x0,0x6c,0x0,0x69,0x0,0x61,0x0,0x73,0x0,0x5f,0x0,0x6f,0x0,0x66,0x0,0x5f,0x0,0x62,0x0,0x5f,0x0,0x63,0x0,0x6f,0x0,0x6d,0x0,0x70,0x0,0x72,0x0,0x65, + 0x0,0x73,0x0,0x73,0x0,0x39,0x0,0x5f,0x0,0x64,0x0,0x75,0x0,0x70,0x0,0x65,0x0,0x2e,0x0,0x74,0x0,0x78,0x0,0x74, + +}; + +static const unsigned char qt_resource_struct[] = { + // : + 0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1, +0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, + // :/files 
+ 0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x0,0x0,0x6,0x0,0x0,0x0,0x2, +0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0, + // :/files/a.txt + 0x0,0x0,0x0,0x4e,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0xf, +TIMESTAMP:files/a.txt + // :/files/b.txt + 0x0,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0, +TIMESTAMP:files/b.txt + // :/files/c_with_a_content.txt + 0x0,0x0,0x0,0x20,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0xf, +TIMESTAMP:files/c_with_a_content.txt + // :/files/alias_of_b.txt + 0x0,0x0,0x0,0x94,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x0, +TIMESTAMP:files/b.txt + // :/files/alias_of_b_compress9_dupe.txt + 0x0,0x0,0x0,0xb6,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1e, +TIMESTAMP:files/b.txt + // :/files/alias_of_b_compress9.txt + 0x0,0x0,0x0,0x5e,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x1e, +TIMESTAMP:files/b.txt + +}; + +#ifdef QT_NAMESPACE +# define QT_RCC_PREPEND_NAMESPACE(name) ::QT_NAMESPACE::name +# define QT_RCC_MANGLE_NAMESPACE0(x) x +# define QT_RCC_MANGLE_NAMESPACE1(a, b) a##_##b +# define QT_RCC_MANGLE_NAMESPACE2(a, b) QT_RCC_MANGLE_NAMESPACE1(a,b) +# define QT_RCC_MANGLE_NAMESPACE(name) QT_RCC_MANGLE_NAMESPACE2( \ + QT_RCC_MANGLE_NAMESPACE0(name), QT_RCC_MANGLE_NAMESPACE0(QT_NAMESPACE)) +#else +# define QT_RCC_PREPEND_NAMESPACE(name) name +# define QT_RCC_MANGLE_NAMESPACE(name) name +#endif + +#if defined(QT_INLINE_NAMESPACE) +inline namespace QT_NAMESPACE { +#elif defined(QT_NAMESPACE) +namespace QT_NAMESPACE { +#endif + +bool qRegisterResourceData(int, const unsigned char *, const unsigned char *, const unsigned char *); +bool qUnregisterResourceData(int, const unsigned char *, const unsigned char *, const unsigned char *); + +#ifdef QT_NAMESPACE +} +#endif + +int QT_RCC_MANGLE_NAMESPACE(qInitResources)(); +int QT_RCC_MANGLE_NAMESPACE(qInitResources)() +{ + int version = 3; + QT_RCC_PREPEND_NAMESPACE(qRegisterResourceData) + (version, qt_resource_struct, qt_resource_name, qt_resource_data); + return 1; +} + +int QT_RCC_MANGLE_NAMESPACE(qCleanupResources)(); +int 
QT_RCC_MANGLE_NAMESPACE(qCleanupResources)() +{ + int version = 3; + QT_RCC_PREPEND_NAMESPACE(qUnregisterResourceData) + (version, qt_resource_struct, qt_resource_name, qt_resource_data); + return 1; +} + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif + +namespace { + struct initializer { + initializer() { QT_RCC_MANGLE_NAMESPACE(qInitResources)(); } + ~initializer() { QT_RCC_MANGLE_NAMESPACE(qCleanupResources)(); } + } dummy; +} + +#ifdef __clang__ +# pragma clang diagnostic pop +#endif diff --git a/tests/auto/tools/rcc/data/deduplication/deduplication.qrc b/tests/auto/tools/rcc/data/deduplication/deduplication.qrc new file mode 100644 index 00000000000..fd8a776503e --- /dev/null +++ b/tests/auto/tools/rcc/data/deduplication/deduplication.qrc @@ -0,0 +1,10 @@ +<!DOCTYPE RCC><RCC version="1.0"> +<qresource> + <file>files/a.txt</file> + <file>files/b.txt</file> + <file alias="files/alias_of_b.txt">files/b.txt</file> + <file>files/c_with_a_content.txt</file> + <file alias="files/alias_of_b_compress9.txt" compress="9">files/b.txt</file> + <file alias="files/alias_of_b_compress9_dupe.txt" compress="9">files/b.txt</file> +</qresource> +</RCC> diff --git a/tests/auto/tools/rcc/data/deduplication/files/a.txt b/tests/auto/tools/rcc/data/deduplication/files/a.txt new file mode 100644 index 00000000000..abd91bd4652 --- /dev/null +++ b/tests/auto/tools/rcc/data/deduplication/files/a.txt @@ -0,0 +1 @@ +a test file
\ No newline at end of file diff --git a/tests/auto/tools/rcc/data/deduplication/files/b.txt b/tests/auto/tools/rcc/data/deduplication/files/b.txt new file mode 100644 index 00000000000..01e4d76fc57 --- /dev/null +++ b/tests/auto/tools/rcc/data/deduplication/files/b.txt @@ -0,0 +1 @@ +b test file
\ No newline at end of file diff --git a/tests/auto/tools/rcc/data/deduplication/files/c_with_a_content.txt b/tests/auto/tools/rcc/data/deduplication/files/c_with_a_content.txt new file mode 100644 index 00000000000..abd91bd4652 --- /dev/null +++ b/tests/auto/tools/rcc/data/deduplication/files/c_with_a_content.txt @@ -0,0 +1 @@ +a test file
\ No newline at end of file diff --git a/tests/auto/tools/rcc/tst_rcc.cpp b/tests/auto/tools/rcc/tst_rcc.cpp index af4a992d5cf..ac024b11d0f 100644 --- a/tests/auto/tools/rcc/tst_rcc.cpp +++ b/tests/auto/tools/rcc/tst_rcc.cpp @@ -152,6 +152,11 @@ void tst_rcc::rcc_data() QTest::newRow("legal") << m_dataPath + QLatin1StringView("/legal") << "legal.qrc" << "rcc_legal.cpp"; + + if (sizeof(size_t) == 8) { + const QString deduplicationPath = m_dataPath + QLatin1String("/deduplication"); + QTest::newRow("deduplication") << deduplicationPath << "deduplication.qrc" << "deduplication.expected"; + } } static QStringList readLinesFromFile(const QString &fileName, |