HDFS to S3
#!/usr/bin/env bash
###################################################################################
# created         : 2021-01-10
#
# Purpose         : Script to copy data from HDFS to S3
#
# Arguments       : Up to 4 arguments; the first two are required.
#   1st            : The properties file for the S3/BDA configuration
#   2nd            : TableList file with the list of tables that need to be moved,
#                    along with primary partition details.
#   3rd (optional) : fRunDate in YYYY-MM-DD format.
#   4th (optional) : tRunDate in YYYY-MM-DD format.
#
# Example         : sh hdfs_to_S3.sh /conf/properties/aws-us-east-1.properties \
#                       /conf/tableList/smis_hdfs_to_s3.config 2021-01-10 2021-01-10
#
# Version History : 1.0
###################################################################################
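# NOTE: the excerpt below references $props, $fromRunDate and $toRunDate without
# showing where they are assigned. A minimal sketch of how the positional arguments
# could be captured is given here for illustration only; the variable names and the
# default of "today" for the optional dates are assumptions, not taken from the
# original script:
#   props=$1
#   tableListFile=$2
#   fromRunDate=${3:-$(date +%Y-%m-%d)}
#   toRunDate=${4:-$(date +%Y-%m-%d)}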
# Print the properties file path being used for this run
echo "$props"
# Build a partition filter file: one "<partition>=<date>" line per day in the
# requested range, with the date rendered in the supplied format.
createFilterFile() {
    fromDate=$1
    toDate=$2
    partName=$3
    dateFormat=$4
    if [ "$fromDate" != "$toDate" ]
    then
        # Walk the range one day at a time (inclusive of both end dates)
        while ! [[ "$fromDate" > "$toDate" ]]
        do
            parsedDate=$(date -d "$fromDate" +"$dateFormat")
            echo "$partName=$parsedDate" >> "$filterFileNm"
            fromDate=$(date -d "$fromDate + 1 day" +%Y-%m-%d)
        done
    else
        # Single-day run
        parsedDate=$(date -d "$fromDate" +"$dateFormat")
        echo "$partName=$parsedDate" >> "$filterFileNm"
    fi
}
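# Example (illustrative only; "load_date" is a hypothetical partition column):
#   createFilterFile "2021-01-10" "2021-01-12" "load_date" "%Y%m%d"
# appends the following lines to $filterFileNm:
#   load_date=20210110
#   load_date=20210111
#   load_date=20210112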
# Remove the temporary partition filter file
cleantmp() {
    rm -f "$filterFileNm"
}
sendMail "Started" "$scriptName Started loading data to S3 Bucket: $bucket"
fi
hdfs dfs -mkdir "$hdfsReconFile$(date +%Y-%m-%d)/"
if [ $? -ne 0 ]; then
logIt "INFO" "Recon path already exists " "" "" "" 0
fi
echo "INFO :------------------ HDFS Log and Recon paths created ------------------
"
echo ${reset}
echo""
if [ -z "$AWS_ACCESS_KEY_ID" ]
then
logIt "INFO" "AWS_ACCESS_KEY_ID not set" "" "" "" 0
read -s -p "s3 access key :" access_key
export AWS_ACCESS_KEY_ID=$access_key
else
logIt "INFO" "AWS_ACCESS_KEY_ID set" "" "" "" 0
fi
if [ -z "$AWS_SECRET_ACCESS_KEY" ]
then
logIt "INFO" "AWS_SECRET_ACCESS_KEY not set" "" "" "" 0
read -s -p "s3 secret key :" secret_key
export AWS_SECRET_ACCESS_KEY=$secret_key
else
logIt "INFO" "AWS_SECRET_ACCESS_KEY set" "" "" "" 0
fi
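# To run non-interactively, export the credentials (and, if used, AWS_SESSION_TOKEN)
# before invoking the script, e.g.:
#   export AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=... AWS_SESSION_TOKEN=...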
echo "${reset}"
echo ""
rm -f "$filterFileNm"
createFilterFile "$fromRunDate" "$toRunDate" "$partition" "%Y%m%d"
else
echo "ERROR : Wrong Partition Name $partition"
logIt "INFO" "Wrong Partition Name $partition " "" "" "" 0
exit 1
fi
if [ "$srcPath" != "" ]
then
IFS=$'\n'
for sqlList in "${uniqueSrcPath[@]}"
do
if [ "NA" != "$partition" ]
then
# echo $sqlList
# echo $partition
# echo $tablename
path1=${sqlList##*$partition}
path2=${path1%%/*}
btwPath1=${sqlList##*$srcTbleName}
btwPath2=${btwPath1%%$partition*}
partitionname=`echo $partition$path2`
# echo $btwPath2
# echo $partitionname
# echo ${sqlList%%$partition*}$partitionname
parsedSrcPath=$sqlList
dest=`echo
"$bucket/$srcSchemaName/$srcTbleName/$partitionname"`
else
parsedSrcPath=$sqlList
dest=`echo "$bucket/$srcSchemaName/$srcTbleName"`
fi
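# Illustration (hypothetical values): for a source path such as
#   /apps/hive/warehouse/smis.db/orders/load_date=2021-01-10
# with srcSchemaName=smis, srcTbleName=orders and partition=load_date, the
# destination resolves to <bucket>/smis/orders/load_date=2021-01-10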
source=${parsedSrcPath//[[:space:]]}
dest=${dest//[[:space:]]}
echo "INFO :
$server_side_encryption_algorithm=$server_side_encryption_algorithm"
logIt "INFO" "$server_side_encryption_algorithm" ""
"$server_side_encryption_algorithm" "" 0
echo "INFO : $s3_endpoint=$s3_endpoint"
logIt "INFO" "$s3_endpoint" "" "$s3_endpoint" "" 0
echo "INFO : $s3a_fast_upload=$s3a_fast_upload"
logIt "INFO" "$s3a_fast_upload" "" "$s3a_fast_upload" "" 0
if [ -z "$AWS_ACCESS_KEY_ID" ]
then
logIt "INFO" "AWS_ACCESS_KEY_ID not set" "" "" "" 0
read -s -p "s3 access key :" access_key
export AWS_ACCESS_KEY_ID=$access_key
else
logIt "INFO" "AWS_ACCESS_KEY_ID set again" "" "" ""
0
fi
if [ -z "$AWS_SECRET_ACCESS_KEY" ]
then
logIt "INFO" "AWS_SECRET_ACCESS_KEY not set" "" ""
"" 0
read -s -p "s3 secret key :" secret_key
export AWS_SECRET_ACCESS_KEY=$secret_key
else
logIt "INFO" "AWS_SECRET_ACCESS_KEY set again" ""
"" "" 0
fi
fi
hadoop distcp -Dmapreduce.job.queuename=$yarn_queue \
    -Dfs.s3a.server-side-encryption-algorithm=$server_side_encryption_algorithm \
    -Dfs.s3a.endpoint=$s3_endpoint \
    -Dfs.s3a.fast.upload=$s3a_fast_upload \
    -Dfs.s3a.buffer.dir=$s3a_buffer_dir \
    -Dfs.s3a.multipart.uploads.enabled=$multipart_uploads_enabled \
    -Dfs.s3a.access.key=$AWS_ACCESS_KEY_ID \
    -Dfs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY \
    -Dfs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider \
    -Dfs.s3a.session.token=$AWS_SESSION_TOKEN \
    -Dfs.s3a.proxy.host=$s3a_proxy_host \
    -Dfs.s3a.proxy.port=$s3a_proxy_port \
    -m 150 \
    -update \
    -skipcrccheck \
    -numListstatusThreads 40 \
    "$source" "s3a://$dest" 2>/dev/null
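# Notes on the distcp options above: -update copies only files that are missing or
# differ at the destination, -skipcrccheck skips checksum comparison (HDFS and S3A
# checksums are not comparable), -m 150 caps the number of copy mappers, and
# -numListstatusThreads 40 parallelises the source directory listing.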
if [ $? -ne 0 ]; then
    logIt "ERROR" "BDA to S3 Upload Distcp Command Failed" "" "$tablename" "" 1
    sendMail "Failed" "BDA to S3 Upload Distcp Command Failed ==> $tablename"
    cleantmp
    exit 1
else
    logIt "INFO" "BDA to S3 Upload Finished Successfully. " "" "$tablename" "" 0
fi
echo "INFO : BDA to S3 Upload Finished...for table $tablename and partition : $partitionname"
done
logIt "INFO" "POST BDA Processing Started... " "" "$tablename"
"" 0
rulesEngine=$(getPrePostRules)
IFS=$'\n'
rulesEngine=(`echo "$rulesEngine" | sed "s/[|]$/\\n/g"`)
for rule in "${rulesEngine[@]}"
do
load_date=$(date +%Y-%m-%d)
runTime=$(date +%Y%m%d%H%M%S)
#echo "$sql"
reconSyncFlag="1"
IFS=$'\n'
rows=(`echo "$rows" | sed "s/[|]$/\\n/g"`)
if [ "NA" != "$partition" ]
then
for row in "${rows[@]}"
do
# echo "Row===> $row"
bda_partition=`echo $row | cut -f2 -d"|"`
bda_partition_val=`echo $row | cut -f3 -d"|"`
for ((i=3;i<colCnt;i++))
do
val=$(( i + 1 ))
Matrix_name=`echo $hdr
| cut -f$val -d"|"`
Matrix_value=`echo $row
| cut -f$val -d"|" | tr -d " "`
# echo "Value ==>
$Matrix_value"
len=`echo $Matrix_value
| tr -d " "| wc -m`
# echo " Length ==>
$len"
if [ $Matrix_value =
"0E-18" ]
then
Matrix_value="0.000000000000000000"
fi
# echo "Matrix_name==>
`echo $hdr | cut -f$val -d"|"` Matrix_value ==> `echo $rows | cut -f$val -d"|"`"
# logIt "DEBUG"
"Matrix_name==> ${Matrix_name//[[:space:]]}" "" "${Matrix_value//[[:space:]]}" "" 0
# echo "|$bda_partition|
$bda_partition_val|${Matrix_name//[[:space:]]}|${Matrix_value//[[:space:]]}"
printf "$runTime|
$tablename|$bda_partition|$bda_partition_val|${Matrix_name//[[:space:]]}|$
{Matrix_value//[[:space:]]}|$load_date\n" | hdfs dfs -appendToFile - $appReconFile
done
done
else
    # Non-partitioned table: no partition columns, metrics start at column 2
    bda_partition=""
    bda_partition_val=""
    for ((i=1;i<colCnt;i++))
    do
        val=$(( i + 1 ))
        Matrix_name=$(echo "$hdr" | cut -f$val -d"|")
        Matrix_value=$(echo "$rows" | cut -f$val -d"|")
        logIt "DEBUG" "Matrix_name==> ${Matrix_name//[[:space:]]}" "" "${Matrix_value//[[:space:]]}" "" 0
        printf "$runTime|$tablename|$bda_partition|$bda_partition_val|${Matrix_name//[[:space:]]}|${Matrix_value//[[:space:]]}|$load_date\n" | hdfs dfs -appendToFile - $appReconFile
    done
fi
done
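# Each iteration above appends one recon record per metric to $appReconFile in the
# pipe-delimited form:
#   <runTime>|<tablename>|<bda_partition>|<bda_partition_val>|<metric_name>|<metric_value>|<load_date>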
logIt "INFO" "POST BDA Processing Finished. " "" "$tablename" "" 0
else
echo "INFO : ------------------No data for table ==> $tablename
------------------ "
logIt "INFO" "No data for table. " "" "$tablename" "" 0
fi
fi
if [ "$reconSyncFlag" = "1" ]
then
echo "INFO : Load Recon data to S3 Started..."
logIt "INFO" "Load Recon data to S3 Started..." "" "" "" 0