Script
The script takes a JSON or CSV file as input, where each row (or JSON entry) represents a table with its corresponding srcdb, tgtdb, and partition details.
---
Approach
1. Use a JSON or CSV file as input, containing multiple srcdb, tgtdb, and table partition
details.
---
Updated Implementation
[
  {
    "srcdb": "source_db1",
    "tgtdb": "target_db1",
    "csv_file": "dbfs:/mnt/input/table1.csv"
  },
  {
    "srcdb": "source_db2",
    "tgtdb": "target_db2",
    "csv_file": "dbfs:/mnt/input/table2.csv"
  }
]
Each entry represents one table's operation with a corresponding CSV file.
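For reference, each per-table CSV is expected to carry at least a partition path and a file_size column, since the script below reads row["partition"] and row["file_size"], and the path is parsed as /<table>/<cluster-by column>=<value>. The column names match the code; the rows here are purely illustrative:

partition,file_size
/table1/event_date=2024/01/01,500mb
/table1/event_date=2024/01/02,2gb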
---
import re
import uuid

# Constants
SIZE_MULTIPLIER = {"kb": 1 / (1024 * 1024), "mb": 1 / 1024, "gb": 1}
CHUNK_SIZE_GB = 1  # 1 GB chunk size

def parse_size(size_str):
    """Convert a file-size string (e.g., '500mb', '2gb') to GB."""
    match = re.match(r"([\d.]+)(kb|mb|gb)", size_str.lower())
    if match:
        size, unit = match.groups()
        return float(size) * SIZE_MULTIPLIER[unit]
    return 0  # Default if no match

def extract_partition_info(partition_str):
    """Extract table name, cluster-by column, and value from a partition path."""
    match = re.search(r"/([^/]+)/([^=]+)=([\d/]+)", partition_str)
    if match:
        table_name, clusterby_column, clusterby_value = match.groups()
        return table_name, clusterby_column, clusterby_value
    return None, None, None

def build_queries(df, srcdb, tgtdb, requestid):
    """Build one INSERT per 1 GB chunk for every partition row in df.

    srcdb/tgtdb come from the JSON config entry, requestid from the job parameter.
    """
    # Convert file_size to GB
    df["file_size_gb"] = df["file_size"].apply(parse_size)

    queries = []
    unique_request_id = str(uuid.uuid4())[:8]  # Unique base request ID

    for index, row in df.iterrows():
        table_name, clusterby_column, clusterby_value = extract_partition_info(row["partition"])
        total_size = row["file_size_gb"]
        # Number of 1 GB chunks needed to cover the partition (always at least one)
        num_chunks = max(1, int(total_size // CHUNK_SIZE_GB)
                            + (1 if total_size % CHUNK_SIZE_GB > 0 else 0))

        for chunk in range(num_chunks):
            # Assumption: one request ID per chunk; chunk boundaries simply reuse the
            # partition's cluster-by value (adjust if chunks map to sub-ranges).
            request_id_chunk = f"{requestid}_{unique_request_id}_{chunk + 1}"
            chunk_start = chunk_end = clusterby_value

            query = f"""
            INSERT INTO {tgtdb}.{table_name} (srcdb, tgtdb, table, requestid, condition)
            VALUES ('{srcdb}', '{tgtdb}', '{table_name}', '{request_id_chunk}',
                    '{clusterby_column} >= \"{chunk_start}\" AND {clusterby_column} <= \"{chunk_end}\"');
            """
            queries.append(query.strip())

    return queries
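As a quick check of the chunk arithmetic: a partition whose file_size parses to 2.5 GB gives int(2.5 // 1) = 2 plus 1 for the 0.5 GB remainder, so 3 chunks and therefore 3 INSERT statements; anything at or below 1 GB collapses to a single chunk via the max(1, ...) guard.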
---
Set the input parameters:
json_file → dbfs:/mnt/input/tables.json
requestid → request123
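A minimal driver sketch, assuming a Databricks notebook where the JSON config and per-table CSVs are readable through the /dbfs/... FUSE path; build_queries is the function named in the sketch above, and executing the generated statements (e.g., via spark.sql) is left as the final step:

import json
import pandas as pd

# Hypothetical parameter values; in a notebook these could come from widgets instead.
json_file = "dbfs:/mnt/input/tables.json"
requestid = "request123"

# DBFS paths are visible to plain Python under /dbfs/... (Databricks convention).
with open(json_file.replace("dbfs:", "/dbfs"), "r") as f:
    tables_config = json.load(f)

all_queries = []
for entry in tables_config:
    # Read the per-table CSV listed in this config entry.
    df = pd.read_csv(entry["csv_file"].replace("dbfs:", "/dbfs"))
    all_queries.extend(
        build_queries(df, entry["srcdb"], entry["tgtdb"], requestid)
    )

for q in all_queries:
    print(q)  # or: spark.sql(q) to execute each statement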
---
4. Expected Output
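Using the illustrative table1 rows above with the first JSON entry (source_db1 / target_db1) and requestid = request123, each chunk would produce an INSERT of roughly this shape (the random request-ID fragment is generated at run time):

INSERT INTO target_db1.table1 (srcdb, tgtdb, table, requestid, condition)
VALUES ('source_db1', 'target_db1', 'table1', 'request123_ab12cd34_1',
        'event_date >= "2024/01/01" AND event_date <= "2024/01/01"');

The 500mb partition yields a single statement like this; the 2gb partition yields two, with suffixes _1 and _2.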
---
Final Thoughts
Scalability: multiple tables across different source and target databases can be processed dynamically from a single configuration file.