Polars
A Python library for data frames – tabular data with named columns. Similar to Pandas, but newer and written in Rust. Unlike in Pandas, data frames are immutable.
Cheatsheet
Core data types:
pl.DataFrame
– a table of data
pl.Series
– a homogeneous sequence of values
Columns of a pl.DataFrame are of type pl.Series.
import polars as pl
# Build a data frame from initial data: one keyword per column, each holding
# that column's values (all columns have the same length).
df = pl.DataFrame(
    dict(
        name=["New York", "Los Angeles", "Chicago"],
        state=["New York", "California", "Illinois"],
        population=[8_419_600, 3_980_400, 2_716_000],
    )
)
# Create an empty data frame with an explicit schema (column name -> dtype).
# It is bound to its own name on purpose: rebinding `df` here would replace
# the populated frame created above, and the examples that follow read its
# "population" and "name" columns.
empty_df = pl.DataFrame(schema={"time": pl.Datetime, "quantity": pl.Float64})
# other useful types:
# - pl.Int64
# - pl.Utf8 (strings; spelled `pl.String` in recent releases)
# - pl.Boolean
# - pl.Date
# A column is a `pl.Series`, which provides aggregation methods
print(df.get_column("population").sum())
print(df.get_column("population").mean())
print(df.get_column("name").min())
# Pick out a single row by its position
print(df[0])  # the result is a one-row `pl.DataFrame`, not a tuple or dict
# ...or keep only the rows that satisfy a predicate
print(df.filter(pl.col("name").eq("Chicago")))
# Add another row. Data frames are immutable, so build a one-row frame with
# the same columns, concatenate it, and rebind `df` to the result.
df = pl.concat(
    [
        df,
        pl.DataFrame(
            # Every column is given as a list, even for a single row, so the
            # example does not depend on scalar broadcasting.
            dict(name=["Houston"], state=["Texas"], population=[2304000])
        ),
    ]
)
# Reassign a column value. `with_columns` returns a new data frame instead of
# modifying `df` in place, so the result has to be bound back to `df`.
df = df.with_columns(
    pl.when(pl.col("name") == "New York")
    .then(9000000)
    .otherwise(pl.col("population"))  # keep every other row unchanged
    .alias("population")  # same name, so the existing column is replaced
)
# Plot a graph
import matplotlib.pyplot as plt

timeseries = pl.DataFrame(dict(time=[1, 2, 3], n=[10, 20, 40]))
# Plot the "n" column against "time" (the frame has no other columns);
# each series is converted to a NumPy array before being handed to matplotlib.
plt.plot(timeseries["time"].to_numpy(), timeseries["n"].to_numpy())
plt.show()