0% found this document useful (0 votes)
5 views

Data Science Data Prep With SQL Quick Reference 1636560429

Data science
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

Data Science Data Prep With SQL Quick Reference 1636560429

Data science
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 1

Data Science - Data Prep with SQL - Quick Reference

DATASET PROFILING CLEAN ATTRIBUTES


Volume SELECT COUNT(*) FROM t; Outliers SELECT CASE WHEN attr1 < 0 THEN 0 WHEN
(Quantitative) attr1 > 1000 THEN 1000 ELSE attr1 END as
Velocity SELECT t.date1, COUNT(*) attr1 FROM t;
FROM t GROUP by t.date1
ORDER BY t.date1 desc; Missing Values SELECT COALESCE(attr1,AVG(attr1) OVER ()),
(At Random) COALESCE (attr1,'Unknown') FROM t;
Attribute SELECT attr1, attr2, attr3, attr4
FROM t; Missing Values SELECT COALESCE(attr1,0)
Selection FROM t;
(Not at Random)

Incomplete SELECT * FROM t
Records WHERE t.attr1 IS NULL
AND t.attr2 IS NULL;

Incorrect Values SELECT REPLACE(attr1,'bad','good')
FROM t;

VALIDATE ATTRIBUTES DERIVE ATTRIBUTES


SELECT DISTINCT(attr1) FROM t; Buckets\Binning SELECT attr1, CASE WHEN attr1 <= 50
Domain THEN 'bin1' WHEN attr1 > 50 THEN 'bin2'
ELSE 'bin3' END as attr1_bin FROM t;
Missing SELECT * FROM t
Values WHERE t.attr1 IS NULL; SELECT DAYOFMONTH(date1),
Date Parts
MONTH(date1) FROM t;
Range SELECT MIN(attr1), MAX(attr1),
AVG(attr1) FROM t; Date Difference SELECT DATEDIFF(date1,date2) FROM t;

Data Type SELECT * FROM Last Period SELECT DATEADD(year,-1,date1) FROM t;


information_schema.columns
WHERE table_name = 't'; Dummy Encoding SELECT attr1, CASE WHEN attr1 = 'Male'
(One Hot) THEN 1 ELSE 0 END as male_gender FROM t;
Outliers WITH dev_cte AS (
(95% confidence) SELECT STDDEV(attr1) sdev FROM t)
SELECT attr1, attr2 FROM t COMBINE DATASETS
CROSS JOIN dev_cte c
WHERE t.attr1 > c.sdev * 2; Join Horizontally SELECT t1.attr1, t2.attr2 FROM t1
(Full Match) INNER JOIN t2 ON t1.ID = t2.ID;
Distribution SELECT attr1,
WIDTH_BUCKET(attr1,100,500,5) Join Horizontally SELECT t1.attr1, t2.attr2 FROM t1
FROM t; (Optional Match) LEFT JOIN t2 ON t1.ID = t2.ID;

Union Vertically SELECT attr1, attr2 FROM t1


STANDARDIZE ATTRIBUTES (Deduplicate) UNION SELECT attr1, attr2 FROM t2;

Data Types SELECT CAST(attr1 AS DATE), Union Vertically SELECT attr1, attr2 FROM t1
CAST(attr2 AS INT) FROM t; (No Deduplicate) UNION ALL SELECT attr1, attr2 FROM t2;

Patterns SELECT CASE WHEN attr1 = …,


REPLACE(attr2,'Street','St') FROM t; SPLIT DATASETS
Formatting SELECT UPPER(attr1), Simple Filter SELECT attr1, attr2 FROM t
REPLACE(attr2,'-','') FROM t; WHERE attr1 IS NOT NULL;

Scaling SELECT attr1, attr2/(MAX(attr2) OVER Filter Based on SELECT attr1, SUM(attr2)
(PARTITION BY attr1)) FROM t; Aggregation FROM t GROUP BY attr1
HAVING SUM(attr2) > 10;

CREATE INTERFACE Sampling SELECT attr1, ROW_NUMBER() OVER


(Random) (ORDER BY RANDOM()) as random FROM t;
Create view CREATE VIEW v AS SELECT…
Sampling SELECT attr1, NTILE(4) OVER (ORDER BY
(Non-Random) date1) as quartile FROM t;
Pugsley 2021

You might also like