Sqoop Demo
--connect "jdbc:mysql://quickstart.cloudera:3306" \
--username retail_dba \
--password cloudera
-- Show the tables inside retail_db
sqoop list-tables \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera
-- Run an arbitrary query against the source database without importing anything
sqoop eval \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --query "select count(1) from order_items"
-- Reference:
-- http://www.cloudera.com/content/cloudera/en/developers/home/developer-admin-resources/get-started-with-hadoop-tutorial/exercise-1.html
-- Import every table in retail_db as Avro data files, using 12 parallel mappers
sqoop import-all-tables \
  --num-mappers 12 \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --as-avrodatafile \
  --warehouse-dir /user/hive/warehouse/retail_stage.db
--Default
-- Import a single table as plain text files
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --table departments \
  --as-textfile \
  --target-dir /user/cloudera/departments
-- Import the same table as Hadoop SequenceFiles
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --table departments \
  --as-sequencefile \
  --target-dir /user/cloudera/departments
-- Import the same table as Avro data files
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --table departments \
  --as-avrodatafile \
  --target-dir /user/cloudera/departments
-- A file with the extension .avsc will be created under the directory from
-- which "sqoop import" is executed.
-- Copy the .avsc file to an HDFS location.
-- Create a Hive table with LOCATION set to /user/cloudera/departments and
-- TBLPROPERTIES pointing to the .avsc file.
hadoop fs -put sqoop_import_departments.avsc /user/cloudera
-- Hive external table over the Avro files imported above; the column schema
-- comes from the .avsc file copied to HDFS in the previous step.
-- Fix: the avro.schema.url string literal was broken across two lines
-- mid-word, which would make the statement invalid if pasted as-is.
CREATE EXTERNAL TABLE departments
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///user/cloudera/departments'
TBLPROPERTIES ('avro.schema.url'='hdfs://quickstart.cloudera/user/cloudera/sqoop_import_departments.avsc');
sqoop import-all-tables \
--num-mappers 1 \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--hive-import \
--hive-overwrite \
--create-hive-table \
--compress \
--compression-codec org.apache.hadoop.io.compress.SnappyCodec \
--outdir java_files
-- Basic import
-- Minimal import relying on Sqoop defaults for format and parallelism
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --table departments \
  --target-dir /user/cloudera/departments
-- Importing a table without a primary key using multiple threads (--split-by).
-- When using --split-by, an indexed column is highly desirable;
-- if the column is not indexed, performance will be poor because each
-- thread performs a full table scan.
sqoop import \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--table departments \
--target-dir /user/hive/warehouse/retail_ods.db/departments \
--append \
--fields-terminated-by '|' \
--lines-terminated-by '\n' \
--split-by department_id \
--outdir java_files
-- Incremental load
sqoop import \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--table departments \
--target-dir /user/hive/warehouse/retail_ods.db/departments \
--append \
--fields-terminated-by '|' \
--lines-terminated-by '\n' \
--check-column "department_id" \
--incremental append \
--last-value 7 \
--outdir java_files
-- Hive related
-- Overwrite existing data associated with hive table (hive-overwrite)
sqoop import \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--table departments \
--fields-terminated-by '|' \
--lines-terminated-by '\n' \
--hive-home /user/hive/warehouse/retail_ods.db \
--hive-import \
--hive-overwrite \
--hive-table departments \
--outdir java_files
--Initial load
sqoop import \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--table departments \
--as-textfile \
--target-dir=/user/cloudera/sqoop_merge/departments
--Validate
-- Inspect the current contents of the source table
sqoop eval \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --query "select * from departments"
--update
sqoop eval --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username retail_dba \
--password cloudera \
--query "update departments set department_name='Testing Merge' where
department_id = 9000"
--Insert
-- Add a brand-new row that will only exist in the delta data set
sqoop eval \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --query "insert into departments values (10000, 'Inserting for merge')"
--New load
sqoop import \
--connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
--username=retail_dba \
--password=cloudera \
--table departments \
--as-textfile \
--target-dir=/user/cloudera/sqoop_merge/departments_delta \
--where "department_id >= 9000"
--Merge
-- Merge the delta on top of the base data set, keyed on department_id;
-- the merged output lands in departments_stage.
-- NOTE(review): --class-name/--jar-file must match the codegen output of the
-- last import (presumably under /tmp/sqoop-<user>/compile/ -- TODO confirm);
-- <get_it_from_last_import> is a placeholder to be replaced by the real path.
sqoop merge --merge-key department_id \
--new-data /user/cloudera/sqoop_merge/departments_delta \
--onto /user/cloudera/sqoop_merge/departments \
--target-dir /user/cloudera/sqoop_merge/departments_stage \
--class-name departments \
--jar-file <get_it_from_last_import>