# Import the Pandas library
import pandas as pd

# Import some toy data as a pandas DataFrame
df = pd.read_csv("https://ssciwr.github.io/jupyter-data-exploration/data.csv")

type(df)

pandas.core.frame.DataFrame

len(df)

20

# Display the first few rows of data
df.head()

# Display general DataFrame info (columns, entries, types)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           20 non-null     object 
 1   Age            20 non-null     int64  
 2   Sex            20 non-null     object 
 3   Height         20 non-null     float64
 4   Eye colour     20 non-null     object 
 5   Wears glasses  20 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.1+ KB

# A DataFrame is a bit like a Dictionary - we can look up columns by name
names = df["Name"]

# A column of a DataFrame is a Series
type(names)

pandas.core.series.Series

names.head()

0       Bob
1     Simon
2     Clare
3      Jose
4    Hannah
Name: Name, dtype: object

# A Series is a bit like a List - we can select items by index
names[0]

'Bob'

# Here are the first three items:
names[0:3]

0      Bob
1    Simon
2    Clare
Name: Name, dtype: object

# Can also iterate over items
for name in names:
    print(name, "", end="")

Bob Simon Clare Jose Hannah Ryan Craig Suzy Chris Josie Claire John Agnes Robert Julia Fabian Joseph Roberta Chris Lucas

# alternative syntax: dataframe.column_name
# both of these are equivalent:
ages1 = df["Age"]
print(ages1.head())

ages2 = df.Age
print(ages2.head())

# note: this only works if the column label is a valid Python identifier (e.g. it can't contain a space)
# and does not clash with an existing DataFrame attribute or method name

0    12
1    13
2    15
3    11
4     9
Name: Age, dtype: int64
0    12
1    13
2    15
3    11
4     9
Name: Age, dtype: int64

# First row of data (column is implicitly "all" if not specified)
df.iloc[0]

Name               Bob
Age                 12
Sex               Male
Height           130.0
Eye colour        blue
Wears glasses      yes
Name: 0, dtype: object

# First row of data (using : slice operator to select all columns)
df.iloc[0, :]

Name               Bob
Age                 12
Sex               Male
Height           130.0
Eye colour        blue
Wears glasses      yes
Name: 0, dtype: object

# First column of data
df.iloc[:, 0].head()

0       Bob
1     Simon
2     Clare
3      Jose
4    Hannah
Name: Name, dtype: object

# Can select slices of rows and columns: e.g. first 3 rows, last 2 columns
df.iloc[0:3, -2:]

# Can also select a list of indices, e.g. rows 3,5,7, columns 3,5
df.iloc[[3, 5, 7], [3, 5]]

# Row with index label 0 (an integer label; column is implicitly "all" if not specified)
df.loc[0]

Name               Bob
Age                 12
Sex               Male
Height           130.0
Eye colour        blue
Wears glasses      yes
Name: 0, dtype: object

# Row with index label 0 (using : slice operator to select all columns)
df.loc[0, :]

Name               Bob
Age                 12
Sex               Male
Height           130.0
Eye colour        blue
Wears glasses      yes
Name: 0, dtype: object

# "Name" column of data (using : slice operator to select all rows)
df.loc[:, "Name"].head()

0       Bob
1     Simon
2     Clare
3      Jose
4    Hannah
Name: Name, dtype: object

# Can also select a list of labels, e.g. index labels 3,5,7, columns "Height","Wears glasses"
df.loc[[3, 5, 7], ["Height", "Wears glasses"]]

# This returns True, as the condition 10 > 9 is true
10 > 9

True

# Similarly, this returns False, as the condition 8 > 9 is false
8 > 9

False

# Can do the same with a Series - returns a Boolean (true/false) Series
df["Age"] > 9

0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8      True
9     False
10     True
11    False
12    False
13    False
14    False
15    False
16    False
17     True
18    False
19    False
Name: Age, dtype: bool

# loc can take this as the selection, e.g. older than 9
df.loc[df["Age"] > 9]

# can combine conditions with & e.g. older than 9 and have blue eyes
df.loc[(df["Age"] > 9) & (df["Eye colour"] == "blue")]
# note: good idea to wrap each condition in brackets when combining them

# can have multiple conditions with | e.g. younger than 7 or wears glasses
df.loc[(df["Age"] < 7) | (df["Wears glasses"] == "yes")]

df["Eye colour"].describe()

count       20
unique       4
top       blue
freq         8
Name: Eye colour, dtype: object

df["Eye colour"].count()

np.int64(20)

df["Eye colour"].unique()

array(['blue', 'green', 'brown', 'grey'], dtype=object)

df["Eye colour"].value_counts()

Eye colour
blue     8
brown    8
grey     3
green    1
Name: count, dtype: int64

df["Eye colour"].hist()

<Axes: >

df["Height"].describe()

count     20.000000
mean     112.200000
std       23.283945
min       59.000000
25%      104.750000
50%      111.750000
75%      124.000000
max      158.000000
Name: Height, dtype: float64

df["Height"].hist()

<Axes: >

# import matplotlib

import matplotlib.pyplot as plt

# see how two columns are correlated with a scatter plot
df.plot.scatter(x="Age", y="Height")

<Axes: xlabel='Age', ylabel='Height'>

# do the same thing, but use matplotlib to customise the plot
# make a larger figure
fig, axs = plt.subplots(figsize=(12, 4))
# pass our axis to pandas plot
df.plot.scatter(x="Age", y="Height", ax=axs)
# set a title
plt.title("Height vs Age")
# display the plot
plt.show()

# filter the data before plotting, and plot multiple labelled datapoints
# one shared figure/axes, so both subsets end up on the same plot
fig, axs = plt.subplots(figsize=(12, 4))
# rows where Sex is "Male", drawn as green crosses
df.loc[df["Sex"] == "Male"].plot.scatter(
    x="Age", y="Height", ax=axs, label="Male", marker="x", color="green"
)
# rows where Sex is "Female", drawn as blue circles on the same axes (ax=axs)
df.loc[df["Sex"] == "Female"].plot.scatter(
    x="Age", y="Height", ax=axs, label="Female", marker="o", color="blue"
)
# the label= values passed above become the legend entries
plt.legend()
plt.title("Height vs Age")
plt.show()

# one row of three axes: one pie chart per column
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))
# value_counts() gives the number of rows per distinct value; ax= selects the target axes
df["Sex"].value_counts().plot.pie(ax=axs[0])
df["Wears glasses"].value_counts().plot.pie(ax=axs[1])
df["Eye colour"].value_counts().plot.pie(ax=axs[2])
# NOTE(review): plt.plot() is called with no data, and the recorded cell output is [];
# the other plotting cells in this file end with plt.show() instead
plt.plot()

[]

# same three pie charts as above, but looping over the columns instead of
# repeating the plotting call once per column
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))
fig.suptitle("Value counts per column")
# zip pairs each axes object with the column that should be drawn on it
for ax, column in zip(axs, ["Sex", "Wears glasses", "Eye colour"]):
    df[column].value_counts().plot.pie(ax=ax)
# compute the layout only after the pies exist, so their labels are taken into account
fig.tight_layout()
# NOTE(review): plt.plot() is called with no data, and the recorded cell output is [];
# the other plotting cells in this file end with plt.show() instead
plt.plot()

[]

grouped = df.groupby(["Sex"])

type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

grouped.groups

{'Female': [2, 4, 7, 9, 10, 12, 14, 17], 'Male': [0, 1, 3, 5, 6, 8, 11, 13, 15, 16, 18, 19]}

for group, data in grouped:
    print(group)
    print(data.head())

('Female',)
      Name  Age     Sex  Height Eye colour Wears glasses
2    Clare   15  Female   142.5      green            no
4   Hannah    9  Female   111.0       blue           yes
7     Suzy   14  Female   137.0       grey           yes
9    Josie    8  Female   107.0      brown            no
10  Claire   16  Female   158.0       blue            no
('Male',)
    Name  Age   Sex  Height Eye colour Wears glasses
0    Bob   12  Male   130.0       blue           yes
1  Simon   13  Male   120.0       blue            no
3   Jose   11  Male   117.0      brown            no
5   Ryan   11  Male   124.0      brown            no
6  Craig   12  Male   124.0      brown            no

df.groupby(["Sex"])["Age"].count()

Sex
Female     8
Male      12
Name: Age, dtype: int64

df.groupby(["Sex"])["Age"].count().plot.pie()

<Axes: ylabel='Age'>

# equivalent "by hand" version
# Split-apply-combine by hand, one entry per distinct value of "Sex".
# NOTE: the result is called count_by_age, but it is keyed by Sex; each value
# is the count() of the "Age" entries belonging to that Sex.
# split: the distinct group labels, in order of first appearance
index = list(df["Sex"].unique())
# apply: run "count" on the Age values of each subset
data = [df.loc[df["Sex"] == sex, "Age"].count() for sex in index]
# combine: put the per-group results back into a labelled Series
count_by_age = pd.Series(data, index=index, name="Age")

count_by_age

Male      12
Female     8
Name: Age, dtype: int64

# can group-by multiple columns
multigroup = df.groupby(["Eye colour", "Sex"]).Name.count()
multigroup

Eye colour  Sex   
blue        Female    4
            Male      4
brown       Female    1
            Male      7
green       Female    1
grey        Female    2
            Male      1
Name: Name, dtype: int64

# for plotting, unstack is helpful: converts our multi-index series into a dataframe with inner index as columns
multigroup.unstack()

multigroup.unstack().plot.pie(subplots=True, figsize=(18, 8))

array([<Axes: ylabel='Female'>, <Axes: ylabel='Male'>], dtype=object)

# custom apply function: count people older than 10 years old
def count_older_than_ten(series):
    """Return the number of entries in *series* that are greater than 10.

    Intended for use with ``groupby(...)[column].apply(...)``, where it is
    called once per group with that group's values as a Series.
    """
    # boolean mask: True where the value exceeds 10
    is_older = series > 10
    # keep only the matching entries, then count them
    older = series[is_older]
    return older.count()


df.groupby(["Sex"])["Age"].apply(count_older_than_ten)

Sex
Female    4
Male      5
Name: Age, dtype: int64

# same thing with a lambda instead of defining a function
df.groupby(["Sex"])["Age"].apply(lambda x: x.loc[x > 10].count())

Sex
Female    4
Male      5
Name: Age, dtype: int64

# display type of each column
df.dtypes

Name              object
Age                int64
Sex               object
Height           float64
Eye colour        object
Wears glasses     object
dtype: object

# display memory usage of each column
df.memory_usage(deep=True)

Index             132
Name             1081
Age               160
Sex              1076
Height            160
Eye colour       1069
Wears glasses    1025
dtype: int64

# list unique values in "Sex" column:
df["Sex"].unique()

array(['Male', 'Female'], dtype=object)

# see how much RAM is used to store this column as strings
df["Sex"].memory_usage(deep=True)

1208

# convert to a category type
df["Sex"] = df["Sex"].astype("category")

# list unique values in column:
df["Sex"].unique()

['Male', 'Female']
Categories (2, object): ['Female', 'Male']

# check RAM usage now
df["Sex"].memory_usage(deep=True)

368

# do the same for eye colour
df["Eye colour"] = df["Eye colour"].astype("category")

# list unique values to confirm Wears glasses is really a boolean:
df["Wears glasses"].unique()

array(['yes', 'no'], dtype=object)

# see how much RAM is used to store this column as strings
df["Wears glasses"].memory_usage(deep=True)

1157

# convert "yes" to True, "no" to False
df["Wears glasses"] = df["Wears glasses"].map({"yes": True, "no": False})
df["Wears glasses"].unique()

array([ True, False])

df["Wears glasses"].memory_usage(deep=True)

152

Part 2 - Pandas with toy data

Pandas

Selecting rows and columns

iloc

loc

Conditionals

Summarizing data

Plotting

Matplotlib

Groupby

Types

Next

	Name	Age	Sex	Height	Eye colour	Wears glasses
0	Bob	12	Male	130.0	blue	yes
1	Simon	13	Male	120.0	blue	no
2	Clare	15	Female	142.5	green	no
3	Jose	11	Male	117.0	brown	no
4	Hannah	9	Female	111.0	blue	yes