This Python script extracts data from SQL INSERT statements into CSV files, splits the output across multiple CSV files when a row limit is exceeded, and loads each CSV file into a PostgreSQL database.


import re
import csv
import os
import psycopg2


def extract_data_to_csv(input_file, output_folder, max_rows_per_file, db_params):
    # Define the regular expression pattern to extract values from INSERT
    # statements
    pattern = re.compile(
        r"INSERT\s+\[dbo\]\.\[Mobile\]\s+"
        r"\(\[Id\],\s+\[IC\],\s+\[Name\],\s+\[Mobile_Number\],\s+\[Address1\],\s+"
        r"\[Address2\],\s+\[State_Name\],\s+\[Data_Source_Id\],\s+\[Data_Filename\],\s+"
        r"\[WA_Status\]\)\s+VALUES\s+"
        r"\((\d+),\s+N'([^']+)',\s+N'([^']+)',\s+N'([^']+)',\s+N'([^']*)',\s+"
        r"(?:NULL|N'([^']*)'),\s+N'([^']+)',\s+(\d+),\s+N'([^']+)',\s+(\d+)\)"
    )
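
    # For reference, the pattern is meant to match dump lines shaped like the
    # following (one line in the dump, wrapped here for readability; the sample
    # values are hypothetical, not taken from a real dump):
    #   INSERT [dbo].[Mobile] ([Id], [IC], [Name], [Mobile_Number], [Address1],
    #   [Address2], [State_Name], [Data_Source_Id], [Data_Filename], [WA_Status])
    #   VALUES (1, N'900101015555', N'Ali Bin Abu', N'0123456789',
    #   N'12 Jalan Satu', NULL, N'Selangor', 3, N'dump_01.txt', 1)
    # On such a line, match.groups() yields ten values (None for a NULL
    # Address2): ('1', '900101015555', 'Ali Bin Abu', '0123456789',
    # '12 Jalan Satu', None, 'Selangor', '3', 'dump_01.txt', '1').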

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Initialize connection to the database
    conn = psycopg2.connect(**db_params)
    cur = conn.cursor()

    # Initialize CSV index
    csv_index = 1

    # Initialize row count
    row_count = 0

    # Initialize set to keep track of processed files
    processed_files = set()

    # Open the input SQL file for reading
    with open(input_file, 'r', encoding='utf-16', errors='ignore') as f_in:
        # Initialize the CSV writer, the current file object, and the current path
        csv_writer = None
        csv_file = None
        output_file = None

        # Iterate over lines in the input SQL file
        for line in f_in:
            # Skip lines containing null bytes
            if '\x00' in line:
                continue

            # Match the pattern in each line
            match = pattern.match(line)
            # print("Matched Line:", line)  # Debug: see the matched line
            if match:
                # Extract values from the matched groups
                values = match.groups()
                # print("Extracted Values:", values)  # Debug: see extracted values
                # Remove double quotes from extracted values except for Address1
                values = [value.strip('"') if value is not None and idx != 4 else value
                          for idx, value in enumerate(values)]
                # Replace empty values with None
                values = [None if value == "''" else value for value in values]

                # Open a new CSV file, named by the sequential index, if none
                # is open yet (opening it once per file avoids re-opening the
                # same file for every matched line)
                if output_file is None:
                    output_file = os.path.join(output_folder, f"{csv_index}.csv")
                    csv_file = open(output_file, 'a', newline='', encoding='utf-8')
                    csv_writer = csv.writer(csv_file)
                    # Write the header at the top of each new file
                    csv_writer.writerow(['Id', 'IC', 'Name', 'Mobile_Number',
                                         'Address1', 'Address2', 'State_Name',
                                         'Data_Source_Id', 'Data_Filename',
                                         'WA_Status'])

                # Write the extracted values to the CSV file
                csv_writer.writerow(values)
                row_count += 1

                # Roll over to a new CSV file if the row count reaches the
                # maximum rows per file
                if row_count >= max_rows_per_file:
                    # Close the current CSV file
                    csv_file.close()

                    # Load its data into the database
                    if output_file not in processed_files:
                        copy_csv_to_postgres(output_file, 'bank', cur)
                        print(f"Data from {output_file} copied to database.")
                        # Add the file to the set of processed files
                        processed_files.add(output_file)
                    else:
                        print(f"Skipping file {output_file} as it's already processed.")

                    # Increment the CSV index and reset per-file state
                    csv_index += 1
                    row_count = 0
                    output_file = None

    # Close and load the last CSV file if one is still open (output_file is
    # None when the last file was already handled by the rollover above)
    if output_file is not None:
        csv_file.close()
        # Load data into the database using the same helper
        if output_file not in processed_files:
            copy_csv_to_postgres(output_file, 'bank', cur)
            print(f"Data from {output_file} copied to database.")
            processed_files.add(output_file)
        else:
            print(f"Skipping file {output_file} as it's already processed.")

    # Commit changes and close the database connection
    conn.commit()
    cur.close()
    conn.close()

    print("Data extraction to CSV and insertion into database completed.")


def copy_csv_to_postgres(csv_file_path, table_name, cursor):
    """Loads CSV data into a PostgreSQL table using row-by-row INSERT
    statements (despite the name, the COPY command is not used)."""
    try:
        with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
            # Manually specify column names based on the table schema
            columns = ['Id', 'IC', 'Name', 'Mobile_Number', 'Address1', 'Address2',
                       'State_Name', 'Data_Source_Id', 'Data_Filename', 'WA_Status']

            # Skip the header row
            next(csv_file)

            # Insert each row from the CSV file into the PostgreSQL table
            for line in csv_file:
                # Parse the line with csv.reader so commas within quotes are
                # handled; the reader also strips the surrounding quotes, so no
                # further quote handling is needed before the parameterized
                # INSERT below
                values = next(csv.reader([line]))

                # Replace empty strings with None so they become SQL NULLs
                values = [None if value == '' else value for value in values]

                # Write the values to the database; psycopg2 escapes the %s
                # parameters itself
                cursor.execute(
                    f"INSERT INTO {table_name} ({', '.join(columns)}) "
                    f"VALUES ({', '.join(['%s'] * len(columns))})",
                    values
                )

        print(f"Data from {csv_file_path} copied to {table_name} table.")

    except Exception as e:
        print(f"Failed to insert data into {table_name} table from "
              f"{csv_file_path}. Error: {str(e)}")
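
# For large dumps, a true COPY is much faster than row-by-row INSERTs. Below is
# a minimal sketch using psycopg2's copy_expert; the function name is
# hypothetical, and it assumes the CSV column order matches the target table.
# Empty fields are mapped to NULL, mirroring the loader above.
def copy_csv_with_copy_expert(csv_file_path, table_name, cursor):
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        # Stream the whole file to the server in one COPY, skipping the header
        cursor.copy_expert(
            f"COPY {table_name} FROM STDIN WITH (FORMAT csv, HEADER true, NULL '')",
            csv_file
        )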

# Example usage:
input_file = r"D:\Backup\Script MobileDB - Data\dbo.Mobile.Table.sql"
output_folder = r"D:\Backup\CSV"
max_rows_per_file = 1048576  # Example maximum rows per file (1,048,576 is Excel's worksheet row limit)
db_params = {
    'dbname': 'mobiledb',
    'user': 'postgres',
    'password': 'supostgres',
    'host': 'localhost',
    'port': '5432'
}
extract_data_to_csv(input_file, output_folder, max_rows_per_file, db_params)
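
# A hypothetical sanity check after the load (reuses db_params and the 'bank'
# table name assumed above):
with psycopg2.connect(**db_params) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM bank")
        print("Rows loaded:", cur.fetchone()[0])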
