Python Data Science Cheat Sheets
Seaborn

Data                                          Also see Lists, NumPy & Pandas
>>> import pandas as pd
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> import seaborn as sns
>>> uniform_data = np.random.rand(10, 12)
>>> data = pd.DataFrame({'x': np.arange(1, 101),
                         'y': np.random.normal(0, 4, 100)})

Seaborn also offers built-in data sets:
>>> titanic = sns.load_dataset("titanic")
>>> iris = sns.load_dataset("iris")

Boxplot
>>> sns.boxplot(x="alive",                    Boxplot
                y="age",
                hue="adult_male",
                data=titanic)
>>> sns.boxplot(data=iris, orient="h")        Boxplot with wide-form data

Violinplot
>>> sns.violinplot(x="age",                   Violin plot
                   y="sex",
                   hue="survived",
                   data=titanic)

Plot
>>> plt.title("A Title")                      Add plot title
>>> plt.ylabel("Survived")                    Adjust the label of the y-axis
>>> plt.xlabel("Sex")                         Adjust the label of the x-axis
>>> plt.ylim(0, 100)                          Adjust the limits of the y-axis
>>> plt.xlim(0, 10)                           Adjust the limits of the x-axis
>>> plt.setp(ax, yticks=[0, 5])               Adjust a plot property
>>> plt.tight_layout()                        Adjust subplot params
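Putting the pieces above together, a minimal end-to-end sketch (assuming the bundled titanic dataset and an interactive matplotlib backend; the title text is illustrative):

>>> import seaborn as sns
>>> import matplotlib.pyplot as plt
>>> titanic = sns.load_dataset("titanic")       # built-in example data
>>> ax = sns.boxplot(x="alive", y="age",        # grouped boxplot, returns a matplotlib Axes
...                  hue="adult_male", data=titanic)
>>> plt.title("Age by survival")                # illustrative title
>>> plt.tight_layout()                          # fit labels inside the figure
>>> plt.show()                                  # render the figure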
Bokeh

The basic steps to creating plots with Bokeh:
1. Prepare your data: Python lists, NumPy arrays, Pandas DataFrames and other sequences of values
2. Create a new plot
3. Add renderers for your data, with visual customizations
4. Specify where to generate the output
5. Show or save the results

>>> from bokeh.plotting import figure
>>> from bokeh.io import output_file, show
>>> x = [1, 2, 3, 4, 5]                               Step 1
>>> y = [6, 7, 2, 4, 5]
>>> p = figure(title="simple line example",           Step 2
               x_axis_label='x',
               y_axis_label='y')
>>> p.line(x, y, legend="Temp.", line_width=2)        Step 3
>>> output_file("lines.html")                         Step 4
>>> show(p)                                           Step 5
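Step 5 can also write the result to disk without opening a browser; a minimal sketch using bokeh.io.save (it writes to the target set by output_file() above):

>>> from bokeh.io import save
>>> save(p)            # write lines.html without displaying it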
1 Data                                        Also see Lists, NumPy & Pandas
Under the hood, your data is converted to Column Data Sources.
You can also do this manually:
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame(np.array([[33.9, 4, 65, 'US'],
                                [32.4, 4, 66, 'Asia'],
                                [21.4, 4, 109, 'Europe']]),
                      columns=['mpg', 'cyl', 'hp', 'origin'],
                      index=['Toyota', 'Fiat', 'Volvo'])
>>> from bokeh.models import ColumnDataSource
>>> cds_df = ColumnDataSource(df)

Colormapping
(p1, p2 and p3 below refer to additional figures created with figure(), in the same way as p above.)
>>> from bokeh.models import CategoricalColorMapper
>>> color_mapper = CategoricalColorMapper(
        factors=['US', 'Asia', 'Europe'],
        palette=['blue', 'red', 'green'])
>>> p3.circle('mpg', 'cyl', source=cds_df,
              color=dict(field='origin',
                         transform=color_mapper),
              legend='Origin')

Legend Location
Inside Plot Area
>>> p.legend.location = 'bottom_left'

Outside Plot Area
>>> from bokeh.models import Legend
>>> r1 = p2.asterisk(np.array([1,2,3]), np.array([3,2,1]))
>>> r2 = p2.line([1,2,3,4], [3,4,5,6])
>>> legend = Legend(items=[("One", [p1, r1]), ("Two", [r2])],
                    location=(0, -30))
>>> p.add_layout(legend, 'right')

Legend Orientation
>>> p.legend.orientation = "horizontal"
>>> p.legend.orientation = "vertical"

Legend Background & Border
>>> p.legend.border_line_color = "navy"
>>> p.legend.background_fill_color = "white"
Rows & Columns Layout
Rows
>>> from bokeh.layouts import row
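The row import above only brings in the layout helper; a minimal sketch of placing two figures side by side (p1 and p2 here are hypothetical figures created with figure()):

>>> from bokeh.plotting import figure
>>> from bokeh.layouts import row
>>> from bokeh.io import output_file, show
>>> p1 = figure(); p1.circle([1, 2, 3], [3, 2, 1])    # hypothetical first plot
>>> p2 = figure(); p2.line([1, 2, 3], [1, 2, 3])      # hypothetical second plot
>>> output_file("layout.html")
>>> show(row(p1, p2))                                 # both plots in a single row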
4 Output & Export

Notebook
>>> from bokeh.io import output_notebook, show
>>> output_notebook()

HTML
Standalone HTML
>>> from bokeh.embed import file_html
>>> from bokeh.resources import CDN
>>> html = file_html(p, CDN, "my_plot")
>>> from bokeh.io import output_file, show
>>> output_file('my_bar_chart.html', mode='cdn')

Components
>>> from bokeh.embed import components
>>> script, div = components(p)

PNG
>>> from bokeh.io import export_png
>>> export_png(p, filename="plot.png")

SVG
>>> from bokeh.io import export_svgs
>>> p.output_backend = "svg"
>>> export_svgs(p, filename="plot.svg")
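The script and div strings returned by components() above are meant to be dropped into a page you control; a minimal sketch of writing such a page by hand (the page must also load BokehJS, e.g. from the CDN, for the plot to render; the file name is illustrative):

>>> template = """<html>
...   <head>{script}</head>
...   <body>{div}</body>
... </html>"""
>>> with open("embedded.html", "w") as f:             # hypothetical output file
...     f.write(template.format(script=script, div=div))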
PySpark SQL

Creating DataFrames

From RDDs
>>> from pyspark.sql import Row
>>> from pyspark.sql.types import *

Infer Schema
>>> sc = spark.sparkContext
>>> lines = sc.textFile("people.txt")
>>> parts = lines.map(lambda l: l.split(","))
>>> people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
>>> peopledf = spark.createDataFrame(people)

Specify Schema
>>> people = parts.map(lambda p: Row(name=p[0],
                                     age=int(p[1].strip())))
>>> schemaString = "name age"
>>> fields = [StructField(field_name, StringType(), True)
              for field_name in schemaString.split()]
>>> schema = StructType(fields)
>>> spark.createDataFrame(people, schema).show()
+--------+---+
|    name|age|
+--------+---+
|    Mine| 28|
|   Filip| 29|
|Jonathan| 30|
+--------+---+
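The snippets in this section assume an active SparkSession bound to the name spark; a minimal initialization sketch (the app name is illustrative):

>>> from pyspark.sql import SparkSession
>>> spark = (SparkSession.builder
...          .appName("Python Spark SQL example")   # illustrative app name
...          .getOrCreate())                        # reuses an existing session if one is running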
From Spark Data Sources

JSON
>>> df = spark.read.json("customer.json")
>>> df.show()
+--------------------+---+---------+--------+--------------------+
|             address|age|firstName|lastName|         phoneNumber|
+--------------------+---+---------+--------+--------------------+
|[New York,10021,N...| 25|     John|   Smith|[[212 555-1234,ho...|
|[New York,10021,N...| 21|     Jane|     Doe|[[322 888-1234,ho...|
+--------------------+---+---------+--------+--------------------+
>>> df2 = spark.read.load("people.json", format="json")

Parquet files
>>> df3 = spark.read.load("users.parquet")

TXT files
>>> df4 = spark.read.text("people.txt")
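The address and phoneNumber columns read from customer.json are nested struct/array fields, which is why the column examples below can reference expressions such as df.address.city; the layout can be confirmed with:

>>> df.printSchema()       # shows the nested struct and array fields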
Queries

Like
>>> df.select("firstName",                       Show firstName, and lastName is
              df.lastName.like("Smith")) \       TRUE if lastName is like Smith
      .show()

Startswith - Endswith
>>> df.select("firstName",                       Show firstName, and TRUE if
              df.lastName \                      lastName starts with Sm
                .startswith("Sm")) \
      .show()
>>> df.select(df.lastName.endswith("th")) \      Show last names ending in th
      .show()

Substring
>>> df.select(df.firstName.substr(1, 3) \        Return substrings of firstName
                .alias("name")) \
      .collect()

Between
>>> df.select(df.age.between(22, 24)) \          Show age: values are TRUE if between
      .show()                                    22 and 24

Add, Update & Remove Columns
>>> from pyspark.sql.functions import explode

Adding Columns
>>> df = df.withColumn('city', df.address.city) \
           .withColumn('postalCode', df.address.postalCode) \
           .withColumn('state', df.address.state) \
           .withColumn('streetAddress', df.address.streetAddress) \
           .withColumn('telePhoneNumber',
                       explode(df.phoneNumber.number)) \
           .withColumn('telePhoneType',
                       explode(df.phoneNumber.type))

Updating Columns
>>> df = df.withColumnRenamed('telePhoneNumber', 'phoneNumber')

Removing Columns
>>> df = df.drop("address", "phoneNumber")
>>> df = df.drop(df.address).drop(df.phoneNumber)
Inspect Data
>>> df.dtypes                   Return df column names and data types
>>> df.show()                   Display the content of df
>>> df.head()                   Return first n rows
>>> df.first()                  Return first row
>>> df.take(2)                  Return the first n rows
>>> df.schema                   Return the schema of df
>>> df.describe().show()        Compute summary statistics
>>> df.columns                  Return the columns of df
>>> df.count()                  Count the number of rows in df
>>> df.distinct().count()       Count the number of distinct rows in df
>>> df.printSchema()            Print the schema of df
>>> df.explain()                Print the (logical and physical) plans

Repartitioning
>>> df.repartition(10) \        df with 10 partitions
      .rdd \
      .getNumPartitions()
>>> df.coalesce(1).rdd.getNumPartitions()        df with 1 partition

Running SQL Queries Programmatically

Registering DataFrames as Views
>>> peopledf.createGlobalTempView("people")
>>> df.createTempView("customer")
>>> df.createOrReplaceTempView("customer")

Query Views
>>> df5 = spark.sql("SELECT * FROM customer").show()
>>> peopledf2 = spark.sql("SELECT * FROM global_temp.people") \
                     .show()

Output

Data Structures
>>> rdd1 = df.rdd               Convert df into an RDD
>>> df.toJSON().first()         Convert df into a RDD of string
>>> df.toPandas()               Return the contents of df as a Pandas DataFrame

Write & Save to Files
>>> df.select("firstName", "city") \
      .write \
      .save("nameAndCity.parquet")
>>> df.select("firstName", "age") \
      .write \
      .save("namesAndAges.json", format="json")

Stopping SparkSession
>>> spark.stop()

DataCamp
Learn Python for Data Science Interactively