0% found this document useful (0 votes)
25 views

Resumão - SQL Com Databricks

The document provides a cheat sheet overview of common SQL commands and functions for Databricks SQL (DBSQL) including creating and modifying tables, inserting and updating data, joins, aggregations, and Delta Lake features like change data capture and cloning tables.

Uploaded by

Cristiano Silva
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
25 views

Resumão - SQL Com Databricks

The document provides a cheat sheet overview of common SQL commands and functions for Databricks SQL (DBSQL) including creating and modifying tables, inserting and updating data, joins, aggregations, and Delta Lake features like change data capture and cloning tables.

Uploaded by

Cristiano Silva
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

DBSQL CHEATSHEET DELETE / DROP A TABLE JOINS

DELETE JOIN
Databricks SQL (DB SQL) is a serverless data --Join two tables (via inner, outer, left, or right join)
--Delete rows in a table based upon a condition
warehouse on the Databricks Lakehouse Platform DELETE FROM sales SELECT city.name, country.name
that lets you run all your SQL and BI applications at WHERE predicate; FROM city
[INNER|OUTER|LEFT|RIGHT] JOIN country
scale with up to 12x better price/performance, a DROP TABLE ON city.country_id = country.id;

unified governance model, open formats and APIs, DROP TABLE [IF EXISTS] sales;

and your tools of choice – no lock-in. TRUNCATE COMMON SELECT QUERIES


--Keep a table but delete all of its data.
CREATE TABLES TRUNCATE TABLE sales; SUBQUERIES
--Query an intermediate result set using a subquery.
CREATE TABLE SELECT * FROM sales
WHERE sales_id IN (
--Create a table and define its schema.
CREATE TABLE default.sales ( ADD/MODIFY DATA SELECT DISTINCT sales_id
FROM visit
transaction_datetime TIMESTAMP,
UPDATE );
refund_datetime TIMESTAMP,
bank_zip INT, --Update column values for rows that match a predicate ALIAS COLUMN
customer_zip INT UPDATE sales
); --Alias a column
SET bank_office = 'Augusta' SELECT sales_id AS sales_id_new
WHERE employee_state = 'Maine';
CREATE VIEW FROM sales;

CREATE VIEW mytempview INSERT INTO ALIAS TABLE


AS SELECT * FROM default.sales; --Alias a table
--Insert comma separated values directly into a table.
INSERT [OVERWRITE] INTO mytable VALUES SELECT * FROM my_sales AS m;
CREATE OR REPLACE TABLE ('Harper Bryant', 'Employee', 98101),
CREATE OR REPLACE TABLE default.sales ('Sara Brown', 'Contractor', 48103); ORDER BY
parquet.`/path/to/data`; --Return a table sorted by a column's values. Values
MERGE INTO returned in ascending order by default, or specify DESC.
SELECT productname, sales_id FROM sales
--Upsert (update + insert) using MERGE
ORDER BY sales_id [DESC];
MERGE INTO target
ALTER TABLE USING updates
WHERE
ON target.Id = updates.Id
RENAME TABLE WHEN MATCHED AND target.delete_flag = "true" THEN --Filter a table based upon rows that match one or more
DELETE specific predicates (text or numeric filtering)
ALTER TABLE sales WHEN MATCHED THEN SELECT * FROM sales
RENAME TO salesperson; UPDATE SET * WHERE product_name = "Lego set" AND sales_id > 50000;
WHEN NOT MATCHED THEN
RENAME COLUMN INSERT (date, Id, data) -- or, use INSERT * JSON
ALTER TABLE sales VALUES (date, Id, data); --extract values from a JSON string using the : operator,
RENAME COLUMN customer_first_name TO customer_name; delimiters and identifiers
SELECT raw:owner, raw:OWNER, raw:['owner'], raw:['OWNER']
ADD COLUMNS FROM sales;
ALTER TABLE sales ADD columns (time TIMESTAMP, col_name1 IDENTITY COLUMNS --Extract nested fields from JSON string using the
data_type2); : operator and dot notation
AUTO-INCREMENTING IDENTITY COLUMNS SELECT raw:store.bicycle FROM sales;
CHECK (CONSTRAINTS)
--Add an auto-incrementing identity column --Extract values from an array in JSON using the
--Add a CHECK constraint CREATE TABLE sales : operator
ALTER TABLE sales (id BIGINT GENERATED ALWAYS AS IDENTITY COMMENT 'Surrogate
ADD CONSTRAINT dateWithinRange CHECK (year > '2000-01- key for AccountID',
01'); accountid BIGINT, CLONE
samplecolumn STRING -- Deep clone is a complete, independent copy of the source
NOT NULL (CONSTRAINTS) ); table
CREATE OR REPLACE TABLE default.sales DEEP CLONE
--Add a NOT NULL constraint SHOW IDENTITY COLUMNS parquet.`/path/to/data`;
ALTER TABLE sales --Returns the CREATE TABLE statement that was used to -- Shallow clone is a copy of the source table’s definition,
ADD CONSTRAINT customer_name IS NOT NULL; create a given table or view. Allows you to see which but refers to the source table’s files
column(s) are identity columns. CREATE OR REPLACE TABLE default.sales SHALLOW CLONE
DROP CONSTRAINT (CONSTRAINTS)
SHOW CREATE TABLE sales; parquet.`/path/to/data`;
ALTER TABLE default.sales
DROP CONSTRAINT dateWithinRange;
DBSQL CHEATSHEET DELTA LAKE PERFORMANCE TUNING
CHANGE DATA FEED CACHE
COMMON AGGREGATIONS --Read table changes starting at a specified version number --Cache a table in memory to speed up queries.
COUNT SELECT * FROM table_changes('sales', <start version #>) CACHE SELECT sales;
--Enable Change Data Feed on Delta Lake table
--View count of distinct records in a table
ALTER TABLE sales SET TBLPROPERTIES
EXPLAIN
SELECT COUNT([DISTINCT] sales) --View the physical plan for execution of a given SQL
FROM orderhistory; (delta.enableChangeDataFeed = true);
statement.
CONVERT TO DELTA EXPLAIN [EXTENDED] SELECT * FROM sales;
AVERAGE/MIN/MAX
--Convert a table to Delta Lake format
--View average (mean), sum, or min and max values in a
TUNE WIDE TABLES
CONVERT TO DELTA sales;
column --Sets the number of columns to collect statistics on
SELECT AVG(sales), SUM(sales), MIN(sales), MAX(sales) VACUUM ALTER TABLE SET TBLPROPERTIES
FROM orderhistory; ('delta.dataSkippingNumIndexedCols' = 64);
--Delete files no longer used by the table from cloud
storage
GROUP BY/HAVING OPTIMIZE
VACUUM sales [RETAIN num HOURS] [DRY RUN];
--View an aggregation grouped by a column's values. --OPTIMIZE Delta tables, bin packs tables for better
Optionally, specify a predicate using the HAVING clause TIME TRAVEL performance
that rows must match to be included in the aggregation. --Query historical versions of a Delta Lake table by OPTIMIZE sales
version number or timestamp
SELECT SUM(sales) ANALYZE
FROM orderhistory SELECT * FROM table_name [VERSION AS OF 0 | TIMESTAMP AS
OF "2020-12-18"] --Analyze table to collect statistics on entire column
GROUP BY country
ANALYZE TABLE sales COMPUTE STATISTICS FOR ALL COLUMNS;
[HAVING item_type="soup"]; --View Delta Lake transaction log (table history)
DESCRIBE HISTORY sales; OPTIMIZE/ZORDER
--Periodic OPTIMIZE and ZORDER, run on a nightly basis
DESCRIBE
OPTIMIZE customer_table ZORDER BY customer_id, customer_seq;
PERMISSIONS --View [detailed] information about a database or table
DESCRIBE [DETAIL] sales;
GRANT
-- Grant database and table permissions for admin group DATA INGESTION
GRANT ALL PRIVILEGES ON [DATABASE default|TABLE sales] TO GEOSPATIAL FUNCTIONS COPY INTO
`[email protected]`| admins;
H3 COPY INTO iot_devices
REVOKE --Returns the H3 cell ID (as a BIGINT) corresponding to the FROM "/databricks-datasets/iot/"
provided longitude and latitude at the specified resolution FILEFORMAT = JSON|CSV|PARQUET|etc.;
--Revoke privileges on databases or tables
SELECT h3_longlatash3(longitudeExpr, latitudeExpr,
REVOKE [SELECT TABLE|ALL PRIVILEGES|CREATE TABLE|etc.] ON
resolutionExpr)
sales FROM [`[email protected]`|admins];
--Returns an ARRAY of H3 cell IDs (represented as a BIGINTs) CREATE FUNCTION
SHOW GRANT corresponding to hexagons or pentagons, of the specified
resolution, that are contained by the input areal geography CREATE FUNCTION
--Show a user's permissions on a table SELECT h3_polyfillash3(geographyExpr, resolutionExpr)
SHOW GRANT `[email protected]` ON TABLE default.sales; -- Create a permanent function with parameters.
--Returns the H3 cell IDs that are within (grid) distance k CREATE FUNCTION area(x DOUBLE, y DOUBLE) RETURNS DOUBLE
of the origin cell ID RETURN x * y;
SELECT h3_kring(h3CellIdExpr, kExpr)
-- Use a SQL function in the SELECT clause of a query.
INFORMATION SCHEMA --Returns the grid distance of the two input H3 cell IDs
SELECT h3_distance(h3CellId1Expr, h3CellId2Expr)
SELECT area(c1, c2) AS area FROM t;
-- Use a SQL function in the WHERE clause of a query.
INFORMATION SCHEMA --Returns the parent H3 cell ID of the input H3 cell ID at
SELECT * FROM t WHERE area(c1, c2) > 0;
the specified resolution
--View all tables that have been created in the last 24 SELECT h3_toparent(h3CellIdExpr, resolutionExpr) -- Compose SQL functions.
hours CREATE FUNCTION square(x DOUBLE) RETURNS DOUBLE RETURN
SELECT table_name, table_owner, created_by, last_altered, area(x, x);
last_altered_by, table_catalog
FROM system.information_schema.tables CTE SELECT c1, square(c1) AS square FROM t

WHERE datediff(now(), last_altered) < 1; -- Create a non-deterministic function


CTE CREATE FUNCTION roll_dice()
--View how many tables you have in each schema RETURNS INT
--Create a common table expression (CTE) that can be
SELECT table_schema, count(table_name) NOT DETERMINISTIC
easily reused in other queries.
FROM system.information_schema.tables CONTAINS SQL
WITH common_table_expression_name
WHERE table_schema = 'tpch' COMMENT 'Roll a single 6 sided die'
AS (
GROUP BY table_schema RETURN (rand() * 6)::INT + 1;
SELECT
ORDER BY 2 DESC
product_name as product, -- Roll a single 6-sided die
AVG(sales) as avg_sales
USE SELECT roll_dice();
FROM orderhistory
--Switch to a different database; the database default is GROUP BY product
used if none is specified. Provided to the open source community by Databricks
)
USE database_name; SELECT * FROM common_table_expression_name ©️Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo
are trademarks of the Apache Software Foundation.

You might also like