Python Cheatsheet
-import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
%matplotlib inline
-import os
-import csv
-import datetime

CREATE A DATAFRAME
-pd.DataFrame(data as a dict)
# creates a df with dict key as column name and value as the column values
LOCATING/FILTERING VALUES IN A DF
-df.loc[index number] # locate data at that index
# can input [start : stop] to locate data at these indexes, including stop.
# df.loc[start : stop, column_name] to locate data at indexes start to stop for that column. column_name can be [list of column names]
-df[column name]
# view that column's data
-df.iloc[index number]
# same as df.loc but indexing doesn't include the ending value, and for the column portion you can use index positions instead of column name(s)
-df.loc[condition]
# filter the df based on a condition
# condition can be e.g. df["sales_latte"] > 10000
# can use | for or, & for and, ~ for not

DROP/DELETE ROWS AND COLUMNS
-df.drop([row indexes as a list])
# can use np.arange(start, stop, step) to create the list of numbers
-df.drop(columns = [column names as a list])

DEALING WITH DUPLICATES
-df.duplicated()
# returns a series of True/False for duplicates
# can input [column names as list] to find duplicates only across the column(s)
# can input keep = False to mark all duplicates as True, or keep = "last" to mark the last duplicate as False; default is 'first'
-df.drop_duplicates()
# can input [column names as list] to drop duplicates across the column(s)
# can input keep = False to drop all duplicates, or keep = "last" to keep the last duplicate; default is 'first'
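A minimal sketch of locating, filtering and de-duplicating, using a small made-up sales DataFrame (the column names and values here are illustrative, not from any real dataset):

import pandas as pd

df = pd.DataFrame({"store": ["A", "A", "B", "B"],
                   "sales_latte": [12000, 12000, 8000, 15000]})

df.loc[0:2]                                  # rows 0 to 2 inclusive (label-based)
df.iloc[0:2]                                 # rows 0 and 1 only (position-based, stop excluded)
df.loc[df["sales_latte"] > 10000]            # keep rows where the condition is True
df.duplicated(["store", "sales_latte"])      # True for the repeated (A, 12000) row
df.drop_duplicates(["store", "sales_latte"]) # keeps the first occurrence by default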
DEALING WITH NA VALUES
-df.isna().sum() # returns the count of NA in each column
-df.fillna(value)
# fills NA values with what is input
# df.column_name.fillna(value) for an individual column
-df.dropna()
# drops rows with at least 1 NA value
# can input subset = [column names as list] to only consider NA in the column(s)
# can input thresh = (value) to keep only rows/columns with at least (value) non-NA values

FIND OUT ABOUT THE DATAFRAME
-df.head() # view first 5 rows
-df.tail() # view the last 5 rows
-df.shape # returns (no. rows, no. columns)
-df.dtypes # returns datatypes of each column
-df.index # if you have indexes that are named, use to get a list of the index names, similar to columns
-df.columns # returns list of column names
-df.describe() # returns a dataframe of statistics for each column
-df.corr(method = 'pearson') # returns a dataframe of correlation between each column
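A short sketch of the NA helpers above, on a throwaway DataFrame with deliberate gaps (all values are invented):

import pandas as pd
import numpy as np

df = pd.DataFrame({"price": [1.0, np.nan, 3.0], "qty": [5, 6, np.nan]})

df.isna().sum()                          # count of NA per column
df.fillna(0)                             # replace every NA with 0
df["price"].fillna(df["price"].mean())   # fill one column with its mean
df.dropna(subset=["price"])              # drop rows where "price" is NA
df.dropna(thresh=2)                      # keep rows with at least 2 non-NA values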
GROUPING DATAFRAME
-df.groupby(column name)
# to group the dataframe by the values in that column
# can input [list of columns] to group data by the values in those columns
# usually followed by another function, e.g. .sum(), .mean(), etc.

SORT VALUES
-df.sort_values(by = column name)
# Default is ascending = True
# e.g. df.groupby('host_name').number_of_reviews.sum().sort_values(ascending=False) - group everything by host_name, then sort by total number of reviews from largest to smallest

CHANGING VALUES IN DF
-df.loc[index, column name] = (value)
# can input a range of indexes and a list of column names to change their values

COUNT UNIQUE VALUES
-df.value_counts()
# returns a series of the counts of unique rows
# can input [column names as list]
# Default is sort = True and ascending = False
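A minimal groupby-then-sort sketch on a toy listings table (host_name and number_of_reviews simply echo the example column names in the comment above; the data is made up):

import pandas as pd

df = pd.DataFrame({"host_name": ["Ann", "Ann", "Bob"],
                   "number_of_reviews": [10, 5, 7]})

# total reviews per host, largest first
df.groupby("host_name").number_of_reviews.sum().sort_values(ascending=False)

df.loc[0, "number_of_reviews"] = 12      # overwrite a single cell
df.value_counts(["host_name"])           # counts of each unique host_name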
REPLACING VALUES IN A DF
-df.replace()
# df.replace(old value, new value)
# df.replace([list of old values], new value)
# df.replace([list of old values], [list of new values])
# df.replace({old value_1 : new value_1, old value_2 : new value_2})

ADDING NEW ROW/COLUMN TO DF
-df.loc[new index number/name] = [list of values]
# creates a new row at the given index, or a new row with the given name, with the corresponding list of values. The list of values must match the number of columns
-df[new column name] = [list of values]
# creates a new column with the given name and the corresponding list of values. The list of values must match the number of rows
# E.g. data["revenue"] = data["sales_latte"] + data["sales_muffin"]

EDITING VALUES IN COLUMNS
e.g. df['host'] = "h" + df.host.astype(str) - adds "h" in front of every entry
-df[column name].str.split()
e.g. df[['area1', 'neighbourhood1']] = df.neighbour_hood_info.str.split(",", expand=True)
# "," indicates where to split the string in each value in the column
# expand = True places the split-up strings into separate columns, here column "area1" and column "neighbourhood1"
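A hedged sketch tying replace(), a new column and str.split() together on an invented host/neighbourhood table (all names and values are illustrative):

import pandas as pd

df = pd.DataFrame({"host": [1, 2],
                   "neighbour_hood_info": ["East,Bedok", "West,Jurong"]})

df = df.replace({1: "one", 2: "two"})              # dict form: old value -> new value
df["host"] = "h" + df["host"].astype(str)          # prefix every entry with "h"
df[["area1", "neighbourhood1"]] = df["neighbour_hood_info"].str.split(",", expand=True)
df.loc[2] = ["htwo", "North,Yishun", "North", "Yishun"]   # new row, one value per column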
INSERT HEADERS WHEN READING FILES
header = ["header1", "header2"]
df = pd.read_csv("file_name.csv", names = header)

STATISTICAL FUNCTIONS
-df.corr()    -df.min()
-df.mean()    -df.max()
-df.var()     -df.median()
-df.mode()    -df.std()
-df.sum()
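A small sketch of supplying headers at read time; "temperature.csv" is assumed to be a headerless two-column file (the column names are invented):

import pandas as pd

header = ["month", "temperature"]
df = pd.read_csv("temperature.csv", names=header)   # file has no header row of its own
df.describe()                                        # quick statistics per numeric column
df["temperature"].mean()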
PLOTTING WITH MATPLOTLIB
-import matplotlib.pyplot as plt
%matplotlib inline

# MANIPULATE DATAFRAME TO GET DESIRED DATA TO PLOT A GRAPH
fig, ax = plt.subplots(figsize = (16, 8))
# creates a figure and axes of size (16, 8)

Type of Plot
-plt.plot(x, y, label = "", color = "")
# line graph of x against y; label sets what this graph is called, and color sets a color e.g. "blue", "green", etc.
# Depending on the kind of graph, it might take only x, e.g. hist()
# e.g. plt.scatter(df2_tpg["Year"], df2_tpg["Growth"], label = "Total Population Growth", color = "blue")
# To plot another graph on the same axes, just repeat this function with the corresponding values, e.g. plt.scatter(df2_rpg["Year"], df2_rpg["Growth"], label = "Resident Population Growth", color = "orange")
# Other kinds of plots besides plot(): bar(), barh(), scatter(), boxplot(), hist(), pie()

Legend (if needed)
-plt.legend(loc = (1.04, 0)) # the numbers can be changed to move the legend around

Title
-plt.title("title")

X-Axis
-plt.xlabel("X-axis name")
-plt.xticks(location, labels, rotation)
# location is a list of positions that will be the labels' locations on the axis
# labels is the list of corresponding labels for the given locations
# rotation = 0 or 45 or 90, to change the rotation of the labels
# e.g. plt.xticks(np.arange(12), calendar.month_name[1:13], rotation=20); np.arange(12) creates a list from 0 to 11 and is used as the locations on the axis
# if only 1 list is input into the function, it is used as the tick locations (and those values double as the labels)

Y-Axis
-plt.ylabel("y-axis name")
-plt.yticks(location, labels, rotation)
# similar to plt.xticks()

Save Figure and Show
-plt.savefig("fig_name.png")
-plt.show()

Misc: Correlation Matrix
-import seaborn as sns
-sns.pairplot(data)
# plots a pairwise grid of each column against one another (a quick visual correlation check)

Misc: Density Plot
-df.plot(kind = "density")
# can be used alongside pyplot functions
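A runnable sketch of the plotting workflow above; the df2_tpg/df2_rpg style data is imitated with two made-up growth series:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

years = np.arange(2015, 2021)
total = pd.Series([1.2, 1.1, 0.9, 0.7, 1.0, 0.4], index=years)      # invented values
resident = pd.Series([0.8, 0.9, 0.6, 0.5, 0.7, 0.3], index=years)   # invented values

fig, ax = plt.subplots(figsize=(16, 8))
plt.scatter(years, total, label="Total Population Growth", color="blue")
plt.scatter(years, resident, label="Resident Population Growth", color="orange")
plt.title("Population Growth")
plt.xlabel("Year")
plt.xticks(years, rotation=45)
plt.ylabel("Growth (%)")
plt.legend(loc=(1.04, 0))
plt.savefig("growth.png")
plt.show()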
READ AND WRITE FILES
with open("file_name", "mode") as file_pointer:
e.g. with open("temperature.csv", "w") as file_pointer:
# if the file isn't writing properly, can try inputting newline = "" as well

Modes (Generally only need r, w and a)
- r , Default mode. Opens the file and reads only, from the beginning
- w , Opens in write-only mode and writes from the beginning, deleting everything. A new file is created if one with the same name does not exist
- a , Opens a file for appending from the end. A new file is created if one with the same name does not exist
- r+ , Opens a file for reading and writing from the beginning
- w+ , Opens a file for reading and writing
- a+ , Opens a file for reading and appending

Functions
-file_pointer.read(n)
# reads n characters; if n is not specified, reads the entire file and returns it as a string
-file_pointer.readline()
# reads and returns the characters until the end of the line is reached, as a string
-file_pointer.readlines()
# reads and returns all the lines as a list of strings
-for line in file_pointer: print(line)
# loop to print out the file content line by line
-file_pointer.write(s)
# writes the string s to the file
-file_pointer.writelines(s)
# writes all strings in the sequence s to the file
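A minimal sketch of the open()/read/write pattern; "notes.txt" is an invented filename:

with open("notes.txt", "w") as file_pointer:
    file_pointer.write("first line\n")
    file_pointer.writelines(["second line\n", "third line\n"])

with open("notes.txt", "r") as file_pointer:
    for line in file_pointer:
        print(line.rstrip())     # strip the trailing newline before printing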
with open("csv_filename", "mode") as file_pointer:

Read CSV
csv_pointer = csv.reader(file_pointer)
for each in csv_pointer:
    print(each) # prints each row in the csv as a list of comma separated values

Write CSV
data = [data1, data2, data3]
csv_pointer = csv.writer(file_pointer)
csv_pointer.writerow(data) # writes data as a row, or use writerows() and pass a list of rows to be written

MISCELLANEOUS FUNCTIONS

Map (Substitute each value in a series with another)
e.g. df["a"] = df["a"].map({1: "yes", 4: "No", 7: "No"})

Rounding
e.g. ans2 = round(ans1, 2) # 2 means 2 d.p.

Store column's unique values to a list
e.g. lst = df.loc[:, "column_name"].unique().tolist()

Enumerate
e.g.
lst = ["a", "b", "c"]
for i, j in enumerate(lst):
    print(i, j)
# Output:
# 0 a
# 1 b
# 2 c

Zip
e.g.
a = ("John", "Charles", "Mike")
b = ("Jenny", "Christy", "Monica", "Vicky")
print(list(zip(a, b)))
# Output:
# [('John', 'Jenny'), ('Charles', 'Christy'), ('Mike', 'Monica')]
# list b is longer, so the extra values are ignored and not used

Datetime Functions
-datetime.date.today() # get today's date
-date = datetime.date(year, month, day) # create a date; date.year, date.month, date.day to get year, month and day
-timedelta(days = (no. of days)) # can be used to add/subtract dates
e.g. date - timedelta(days = 30)
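A combined sketch of the csv and datetime snippets above; "scores.csv" and its rows are made up:

import csv
import datetime
from datetime import timedelta

rows = [["name", "score"], ["Ann", 91], ["Bob", 78]]
with open("scores.csv", "w", newline="") as file_pointer:
    csv_pointer = csv.writer(file_pointer)
    csv_pointer.writerows(rows)              # one call, a list of rows

with open("scores.csv", "r") as file_pointer:
    for each in csv.reader(file_pointer):
        print(each)                          # each row comes back as a list of strings

today = datetime.date.today()
deadline = today + timedelta(days=30)        # 30 days from today
print(today, deadline, deadline.year)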
Steps to use SQL
1. Start with creating the database structure
- CREATE SCHEMA AUTHORIZATION
2. Create tables based on entities
- Definition of table structures: provide the CREATE TABLE command plus the NAME of the table, then (COLUMN_NAME DATATYPE(LENGTH) CONSTRAINT, ...)
- E.g. CREATE TABLE MANAGER (
  MANAGER_CODE INTEGER NOT NULL UNIQUE,
  NAME VARCHAR(50) NOT NULL,
  PRIMARY KEY (MANAGER_CODE));
3. Insert data into the table
- INSERT INTO TABLE_NAME VALUES (X, Y, Z);
- E.g. INSERT INTO MANAGER VALUES (1002, "Mary");
- Missing value: if null is allowed, add in the missing value as NULL
- INSERT INTO TABLE_NAME (COL_NAME, COL2_NAME) VALUES (X, Y);
- E.g. INSERT INTO MANAGER (MANAGER_CODE, NAME) VALUES (1003, "Jenny");
4. Querying data
- SELECT COLUMN_NAME FROM TABLE_NAME;
- E.g. SELECT MANAGER_CODE, NAME FROM MANAGER;
To list out all fields:
- SELECT * FROM MANAGER;
To select unique values:
- SELECT DISTINCT NAME FROM MANAGER;
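A hedged end-to-end sketch of these steps driven from Python with the standard library's sqlite3 module; the MANAGER table mirrors the example above, and the CREATE SCHEMA step is skipped because SQLite does not use one:

import sqlite3

conn = sqlite3.connect(":memory:")           # throwaway in-memory database
cur = conn.cursor()

cur.execute("""CREATE TABLE MANAGER (
    MANAGER_CODE INTEGER NOT NULL UNIQUE,
    NAME VARCHAR(50) NOT NULL,
    PRIMARY KEY (MANAGER_CODE))""")

cur.execute("INSERT INTO MANAGER VALUES (1002, 'Mary')")
cur.execute("INSERT INTO MANAGER (MANAGER_CODE, NAME) VALUES (1003, 'Jenny')")
conn.commit()

cur.execute("SELECT MANAGER_CODE, NAME FROM MANAGER")
print(cur.fetchall())                        # [(1002, 'Mary'), (1003, 'Jenny')]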
Changing Title and Sort
SELECT Movie_title AS Movies
FROM Movie
ORDER BY Movie_title;

Changing Title and Listing Runtime > 120
SELECT Movie_title AS Movies
FROM Movie
WHERE Movie_Runtime > 120
ORDER BY Movie_title;

Select Unique Movie ID from Carey Dopico
SELECT DISTINCT Movie_ID
FROM Rental
INNER JOIN Customer ON Customer.Cus_No = Rental.Cus_No
WHERE Customer.Cus_Lname = "Carey" AND Customer.Cus_Fname = "Dopico";

Count How Many Movies in Each Genre
SELECT Movie_Genre, COUNT(*)
FROM Movie
GROUP BY Movie_Genre;

Listing First and Last Name Who Ordered Movie 3
SELECT DISTINCT Cus_Fname, Cus_Lname
FROM Customer
INNER JOIN Rental ON Customer.Cus_No = Rental.Cus_No
WHERE Movie_ID = 3;

Listing First and Last Name Who Never Ordered Movie 3
SELECT DISTINCT Cus_Fname, Cus_Lname
FROM Customer
WHERE NOT (Cus_Fname = "Baltimore" AND Cus_Lname = "Aliza");
Comparison Operators
=, <, >, <=, >=, <> (not equal to), != (not equal to)
Used in conditional expressions

Logical Operators
AND / OR / NOT
Used in conditional expressions

Special Operators
- BETWEEN : check whether an attribute value is within a range
  E.g. SELECT * FROM MANAGER WHERE PORTFOLIO_VALUE BETWEEN 30000 AND 50000;
- IS NULL : check whether an attribute value is null
- LIKE : check whether an attribute value matches a given string pattern
  E.g. SELECT * FROM MANAGER WHERE NAME LIKE "JE%";
- IN : check whether an attribute value matches any value within a value list
  E.g. SELECT NAME FROM MANAGER WHERE MANAGER_CODE IN ( )
- EXISTS : check whether a subquery returns any rows
- DISTINCT : limits values to unique values

Aggregate Functions
- COUNT : returns the number of rows with non-null values for a given column
  E.g. SELECT COUNT(*) FROM MANAGER;
- MIN : returns the minimum attribute value found in a given column
  E.g. SELECT MIN( ) FROM MANAGER;
- MAX : returns the maximum attribute value found in a given column
  E.g. SELECT MAX( ) FROM MANAGER;
- SUM : returns the sum of all values for a given column
  E.g. SELECT SUM( ) AS ( ) FROM MANAGER;
- AVG : returns the average of all values for a given column
  E.g. SELECT AVG( ) FROM MANAGER;
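A small hedged sketch of a few of these operators and aggregates against an invented MANAGER table (PORTFOLIO_VALUE and DEPARTMENT are assumed columns, and the rows are made up):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE MANAGER (NAME TEXT, DEPARTMENT TEXT, PORTFOLIO_VALUE REAL)")
cur.executemany("INSERT INTO MANAGER VALUES (?, ?, ?)",
                [("Jenny", "Equities", 45000), ("Mary", "Bonds", 60000), ("Jeff", "Equities", 30000)])

cur.execute("SELECT * FROM MANAGER WHERE PORTFOLIO_VALUE BETWEEN 30000 AND 50000")
cur.execute("SELECT NAME FROM MANAGER WHERE NAME LIKE 'JE%'")   # LIKE is case-insensitive for ASCII in SQLite
cur.execute("SELECT COUNT(*), AVG(PORTFOLIO_VALUE) FROM MANAGER")
print(cur.fetchone())                                           # (3, 45000.0)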
Command or Option : Description
- INSERT : insert rows into a table
- SELECT : select attributes from rows
- WHERE : restricts the selection of rows based on a conditional expression
- GROUP BY : groups selected rows based on one or more attributes
- HAVING : restricts the selection of grouped rows based on a condition
- ORDER BY : orders the selected rows based on one or more attributes
- UPDATE : modifies an attribute's values in one or more of a table's rows
- DELETE : deletes one or more rows from a table
- COMMIT : permanently saves data changes
- ROLLBACK : restores data to its original values

SQL Constraints
- NOT NULL constraint
# ensures that a column does not accept nulls
- UNIQUE constraint
# ensures that all values in a column are unique
- DEFAULT constraint
# assigns a value to an attribute when a new row is added to a table
- CHECK constraint
# validates data when an attribute value is entered
Ordering the result
- ORDER BY
E.g.
SELECT NAME, PORTFOLIO_VALUE FROM MANAGER
ORDER BY PORTFOLIO_VALUE;
- ORDER BY, DESC
E.g.
SELECT NAME, PORTFOLIO_VALUE FROM MANAGER
ORDER BY PORTFOLIO_VALUE DESC;

WHERE clause
- WHERE condition
E.g.
SELECT NAME, PORTFOLIO_VALUE FROM MANAGER
WHERE PORTFOLIO_VALUE >= 50000
ORDER BY PORTFOLIO_VALUE;

GROUP BY
- E.g.
SELECT COUNT(*), DEPARTMENT FROM MANAGER
GROUP BY DEPARTMENT;

To rename a column
- SELECT PORTFOLIO_VALUE, PORTFOLIO_VALUE * 1.3 AS USD_EQV FROM TABLE_NAME;

HAVING + GROUP BY
- SELECT COUNT(*), DEPARTMENT FROM MANAGER
GROUP BY DEPARTMENT
HAVING COUNT(*) >= 2;
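A hedged sketch of GROUP BY with HAVING on an invented MANAGER table (the rows are made up purely to show which groups survive the HAVING filter):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE MANAGER (NAME TEXT, DEPARTMENT TEXT)")
cur.executemany("INSERT INTO MANAGER VALUES (?, ?)",
                [("Jenny", "Equities"), ("Mary", "Bonds"), ("Jeff", "Equities")])
cur.execute("""SELECT COUNT(*), DEPARTMENT FROM MANAGER
               GROUP BY DEPARTMENT
               HAVING COUNT(*) >= 2""")
print(cur.fetchall())    # [(2, 'Equities')] - only departments with 2 or more managers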
JOINING TABLES
- SELECT * FROM MANAGER, EXECUTIVE
WHERE MANAGER.MANAGER_CODE = EXECUTIVE.MANAGER_CODE;

JOIN CLAUSE
Inner join: combines the results, similar to an AND operation; looks for values common to both tables
Outer join: keeps the non-matching results when the join is done

OUTER JOIN
- LEFT JOIN: left table records will be shown even for non-matching results
- RIGHT JOIN: right table records will be shown even for non-matching results
- FULL JOIN: joins left and right tables, all records
E.g. SELECT * FROM MANAGER M RIGHT JOIN EXECUTIVE E
ON M.MANAGER_CODE = E.MANAGER_CODE;

INNER JOIN
E.g. SELECT * FROM MANAGER M INNER JOIN EXECUTIVE E
ON M.MANAGER_CODE = E.MANAGER_CODE;
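A hedged sketch contrasting an inner and an outer join on invented MANAGER/EXECUTIVE tables; a LEFT JOIN is used here because older SQLite versions do not support RIGHT or FULL joins:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE MANAGER (MANAGER_CODE INTEGER, NAME TEXT)")
cur.execute("CREATE TABLE EXECUTIVE (MANAGER_CODE INTEGER, EXEC_NAME TEXT)")
cur.executemany("INSERT INTO MANAGER VALUES (?, ?)", [(1, "Mary"), (2, "Jenny")])
cur.execute("INSERT INTO EXECUTIVE VALUES (1, 'Tom')")

cur.execute("""SELECT * FROM MANAGER M INNER JOIN EXECUTIVE E
               ON M.MANAGER_CODE = E.MANAGER_CODE""")
print(cur.fetchall())   # only the matching manager: [(1, 'Mary', 1, 'Tom')]

cur.execute("""SELECT * FROM MANAGER M LEFT JOIN EXECUTIVE E
               ON M.MANAGER_CODE = E.MANAGER_CODE""")
print(cur.fetchall())   # Jenny is kept too, with None for the EXECUTIVE columns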
Maths Operators
1. Addition +
2. Subtraction -
3. Multiplication *
4. To the power of **
5. Float division /
6. Integer division //
7. Modulus / Remainder %

Numpy
-np.array([[], [], []]) # create an array from nested lists
-np.zeros((rows, columns)) e.g. np.zeros((2, 1))
-np.ones((rows, columns))
-np.reshape(array, shape) # or array.reshape(shape)

List Functions
-len(list_name) # returns the size of the list
-min(list_name) # returns the minimum value in the list
-max(list_name) # returns the maximum value in the list
-sum(list_name) # returns the sum of all values in the list
-list_name.append(value) # appends value to the list
-list_name.insert(index, value) # inserts value at the given index into the list
-list_name.remove(value) # removes the first occurrence of value from the list
-list_name.pop(index) # removes the value at the given index from the list
-list_name.sort() # sorts the list; can input reverse = True to sort in descending order

Dictionary Functions
-dict.items() # returns the key and value pairs in a list
-dict.keys() and dict.values() # show the keys and values respectively
-dict[key] # returns the value for the given key

String Functions
-.lower() / .upper() / .title() # lowercase / UPPERCASE / Title Case
-.isalnum() / .isdecimal() / .isalpha() / .isnumeric() # return True if all characters in the string are as such
-.rstrip() / .lstrip() / .strip() # remove whitespace
-.split() # splits the string
Strings can be added and multiplied
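A closing sketch exercising a few of the list, dictionary, string and numpy items above (all values invented):

import numpy as np

lst = [3, 1, 2]
lst.append(4)                                    # [3, 1, 2, 4]
lst.sort(reverse=True)                           # [4, 3, 2, 1]
print(len(lst), min(lst), max(lst), sum(lst))    # 4 1 4 10

d = {"a": 1, "b": 2}
print(d.keys(), d["a"])                          # dict_keys(['a', 'b']) 1

s = "  hello world  "
print(s.strip().title())                         # Hello World
print("ab" * 3)                                  # strings can be multiplied: ababab

grid = np.zeros((2, 1))                          # 2 rows, 1 column of zeros
grid = np.reshape(grid, (1, 2))                  # same data, new shape
print(grid)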