Data Exploration with Python and Jupyter - part 3¶
Basic usage of the Pandas library to download a dataset, explore its contents, clean up missing or invalid data, filter the data according to different criteria, and plot visualizations of the data.
- Part 1: Python and Jupyter
- Part 2: Pandas with toy data
- Part 3: Pandas with real data
Let's download some real data¶
For some reason, the London Fire Brigade provides a public spreadsheet of all animal rescue incidents since 2009:
https://data.london.gov.uk/dataset/animal-rescue-incidents-attended-by-lfb
They provide a link to the dataset in Excel format
In [1]:
# import the Pandas library & matplotlib for plotting
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# download an excel spreadsheet with some data and convert it to a DataFrame
url = "https://data.london.gov.uk/download/animal-rescue-incidents-attended-by-lfb/01007433-55c2-4b8a-b799-626d9e3bc284/Animal%20Rescue%20incidents%20attended%20by%20LFB%20from%20Jan%202009.csv.xlsx"
df = pd.read_excel(url)
Display the DataFrame¶
In [3]:
df
Out[3]:
IncidentNumber | DateTimeOfCall | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | ... | UPRN | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 139091 | 2009-01-01 03:01:00 | 2009 | 2008/09 | Special Service | 1.0 | 2.0 | 255 | 510.0 | Redacted | ... | NaN | Waddington Way | 20500146.0 | SE19 | NaN | NaN | 532350 | 170050 | NaN | NaN |
1 | 275091 | 2009-01-01 08:51:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 |
2 | 2075091 | 2009-01-04 10:07:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 |
3 | 2872091 | 2009-01-05 12:27:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | 1.000215e+11 | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 |
4 | 3553091 | 2009-01-06 15:23:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Swindon Lane | 21300122.0 | RM3 | NaN | NaN | 554650 | 192350 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10698 | 066639-27042024 | 2024-04-27 18:03:00 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | Redacted | ... | 1.009334e+10 | CATHEDRAL PASSAGE | 22503387.0 | SE1 | 532713.0 | 180259.0 | 532750 | 180250 | 51.505695 | -0.089152 |
10699 | 066731-27042024 | 2024-04-27 20:41:00 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | CAT STUCK ON ROOF | ... | NaN | SANDRINGHAM AVENUE | 22105748.0 | SW20 | NaN | NaN | 524350 | 169550 | NaN | NaN |
10700 | 066825-27042024 | 2024-04-27 23:27:00 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | Redacted | ... | NaN | RAINHILL WAY | 22700989.0 | E3 | NaN | NaN | 537550 | 182550 | NaN | NaN |
10701 | 067089-28042024 | 2024-04-28 14:21:00 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | KITTEN TRAPPED INSIDE ENGINE COMPARTMENT OF CA... | ... | 1.200377e+07 | WESTERN AVENUE | 20602931.0 | UB6 | 516701.0 | 182992.0 | 516750 | 182950 | 51.533783 | -0.318859 |
10702 | 067673-29042024 | 2024-04-29 16:20:00 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | CAT FALLEN INTO HOLE | ... | NaN | GREENHILL PARK | 20201414.0 | NW10 | NaN | NaN | 521250 | 183650 | NaN | NaN |
10703 rows × 31 columns
Column data types¶
In [4]:
df.dtypes
Out[4]:
IncidentNumber                      object
DateTimeOfCall              datetime64[ns]
CalYear                              int64
FinYear                             object
TypeOfIncident                      object
PumpCount                          float64
PumpHoursTotal                     float64
HourlyNotionalCost(£)                int64
IncidentNotionalCost(£)            float64
FinalDescription                    object
AnimalGroupParent                   object
OriginofCall                        object
PropertyType                        object
PropertyCategory                    object
SpecialServiceTypeCategory          object
SpecialServiceType                  object
WardCode                            object
Ward                                object
BoroughCode                         object
Borough                             object
StnGroundName                       object
UPRN                               float64
Street                              object
USRN                               float64
PostcodeDistrict                    object
Easting_m                          float64
Northing_m                         float64
Easting_rounded                      int64
Northing_rounded                     int64
Latitude                           float64
Longitude                          float64
dtype: object
DateTimeOfCall¶
In [5]:
df["DateTimeOfCall"].head()
Out[5]:
0   2009-01-01 03:01:00
1   2009-01-01 08:51:00
2   2009-01-04 10:07:00
3   2009-01-05 12:27:00
4   2009-01-06 15:23:00
Name: DateTimeOfCall, dtype: datetime64[ns]
In [6]:
# this is already a datetime object, which is great
# a quick sanity check to see if it looks correct:
pd.to_datetime(df["DateTimeOfCall"]).plot()
# should be a single monotonically increasing line: looks good!
Out[6]:
<Axes: >
Use datetime as the index¶
In [7]:
df.set_index("DateTimeOfCall", inplace=True)
In [8]:
df
Out[8]:
IncidentNumber | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | AnimalGroupParent | ... | UPRN | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DateTimeOfCall | |||||||||||||||||||||
2009-01-01 03:01:00 | 139091 | 2009 | 2008/09 | Special Service | 1.0 | 2.0 | 255 | 510.0 | Redacted | Dog | ... | NaN | Waddington Way | 20500146.0 | SE19 | NaN | NaN | 532350 | 170050 | NaN | NaN |
2009-01-01 08:51:00 | 275091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Fox | ... | NaN | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 |
2009-01-04 10:07:00 | 2075091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | NaN | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 |
2009-01-05 12:27:00 | 2872091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Horse | ... | 1.000215e+11 | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 |
2009-01-06 15:23:00 | 3553091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Rabbit | ... | NaN | Swindon Lane | 21300122.0 | RM3 | NaN | NaN | 554650 | 192350 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2024-04-27 18:03:00 | 066639-27042024 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | Redacted | Bird | ... | 1.009334e+10 | CATHEDRAL PASSAGE | 22503387.0 | SE1 | 532713.0 | 180259.0 | 532750 | 180250 | 51.505695 | -0.089152 |
2024-04-27 20:41:00 | 066731-27042024 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | CAT STUCK ON ROOF | Cat | ... | NaN | SANDRINGHAM AVENUE | 22105748.0 | SW20 | NaN | NaN | 524350 | 169550 | NaN | NaN |
2024-04-27 23:27:00 | 066825-27042024 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | Redacted | Cat | ... | NaN | RAINHILL WAY | 22700989.0 | E3 | NaN | NaN | 537550 | 182550 | NaN | NaN |
2024-04-28 14:21:00 | 067089-28042024 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | KITTEN TRAPPED INSIDE ENGINE COMPARTMENT OF CA... | Cat | ... | 1.200377e+07 | WESTERN AVENUE | 20602931.0 | UB6 | 516701.0 | 182992.0 | 516750 | 182950 | 51.533783 | -0.318859 |
2024-04-29 16:20:00 | 067673-29042024 | 2024 | 2024/25 | Special Service | 1.0 | 1.0 | 430 | 430.0 | CAT FALLEN INTO HOLE | Cat | ... | NaN | GREENHILL PARK | 20201414.0 | NW10 | NaN | NaN | 521250 | 183650 | NaN | NaN |
10703 rows × 30 columns
In [9]:
# we can now use the datetime index to select rows: here is January 2021
df.loc["2021-01-01":"2021-01-31", "FinalDescription"]
Out[9]:
DateTimeOfCall
2021-01-01 12:09:00    KITTEN STUCK UP TREE AL REQUESTED FROM SCENE
2021-01-01 14:06:00    Redacted
2021-01-03 18:40:00    CAT WITH LEG TRAPPED IN BATH PLUGHOLE
2021-01-04 13:39:00    Redacted
2021-01-06 10:22:00    Redacted
2021-01-06 13:09:00    CAT IN DISTRESS ON ROOF - ADDITIONAL APPLIANCE...
2021-01-06 20:35:00    DOG TRAPPED IN FOX HOLE - MEET AT CLUB HOUSE
2021-01-07 23:50:00    KITTEN STUCK BETWEEN WALL AND ROOF
2021-01-09 08:01:00    DOG STUCK IN TRENCH
2021-01-10 19:27:00    Redacted
2021-01-12 11:39:00    Redacted
2021-01-12 22:38:00    CAT TRAPPED IN DITCH
2021-01-16 18:05:00    DOG TRAPPED IN PORTER CABIN
2021-01-17 16:09:00    DOG TRAPPED IN WAREHOUSE AREA - CALLER BELIEVE...
2021-01-17 17:09:00    BIRD TRAPPED IN NETTING CALLER WILL MEET YOU
2021-01-18 15:17:00    CAT STUCK IN TREE BEING ATTACKED BY CROWS
2021-01-18 17:06:00    ASSIST RSPCA - SMALL ANIMAL RESUE - BIRD ENTAN...
2021-01-19 18:28:00    CAT TRAPPED BEHIND CUPBOARD
2021-01-19 20:24:00    Redacted
2021-01-19 20:36:00    RUNNING CALL AT ON ROOF
2021-01-20 09:35:00    CAT STUCK BETWEEN TREE BRANCHES
2021-01-21 13:15:00    SWAN TRAPPED IN NETTING
2021-01-21 18:23:00    CAT TRAPPED IN CHIMNEY
2021-01-22 14:22:00    CAT TRAPPED BETWEEN WALL AND FENCE
2021-01-23 10:18:00    CAT TRAPPED IN CHIMNEY
2021-01-23 15:43:00    CAT TRAPPED BETWEEN WALLS
2021-01-23 17:16:00    Redacted
2021-01-25 12:02:00    ASSIST RSPCA WITH FOX STUCK DOWN CULVERT
2021-01-26 13:42:00    DOG STUCK IN RAILINGS - CALLER WILL MEET YOU
2021-01-26 18:21:00    Redacted
2021-01-26 22:44:00    BIRDS TRAPPED IN BASKETBALL COURT CALLER IS ON...
2021-01-26 23:35:00    FOX TRAPPED IN FENCE IN ALLEYWAY NEXT TO
2021-01-27 09:18:00    CAT STUCK IN TREE - ATTENDED YESTERDAY AND ADV...
2021-01-27 10:12:00    BIRD TRAPPED BY LEG IN A TREE - RSPCA IN ATTEN...
2021-01-27 15:22:00    CAT UP TREE ASSIST RSPCA
2021-01-29 10:47:00    TRAPPED FOX IN FENCE IN REAR GARDEN
2021-01-30 14:53:00    CAT STUCK UNDER SHED
2021-01-30 15:28:00    BIRD CAUGHT IN NETTING - RSPCA ON SCENE
2021-01-30 17:54:00    DOG TRAPPED UNDER CAR
2021-01-31 12:53:00    CAT STUCK UP TREE - RSPCA ON SCENE
2021-01-31 13:48:00    INJURED CAT STUCK IN GREEN AREA AT REAR OF
Name: FinalDescription, dtype: object
In [10]:
# resample the timeseries by month and count incidents
df.resample("ME")["IncidentNumber"].count().plot(title="Monthly Calls")
# see https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
plt.show()
In [11]:
# resample by year, sum total costs, average hourly costs
fig, axs = plt.subplots(figsize=(16, 4), ncols=2)
df.resample("YE")["IncidentNotionalCost(£)"].sum().plot(
title="Year total cost", ax=axs[0]
)
df.resample("YE")["HourlyNotionalCost(£)"].mean().plot(
title="Average hourly cost", ax=axs[1]
)
plt.show()
Missing data¶
Different strategies for dealing with missing data:
- Ignore the issue
  - some things may break / not work as expected
- Remove rows/columns with missing data
  - remove all rows with missing data:
df.dropna(axis=0)
  - remove all columns with missing data:
df.dropna(axis=1)
- Guess (impute) missing data
  - replace all missing entries with a value:
df.fillna(1)
  - replace missing entries with the mean for that column:
df.fillna(df.mean())
  - replace each missing entry with the previous valid entry:
df.ffill()
  - replace missing entries by interpolating between valid entries:
df.interpolate()
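As a quick illustration of these options, here is a minimal sketch on a small made-up DataFrame (the values are hypothetical, not from the LFB dataset):

```python
import numpy as np
import pandas as pd

# a tiny made-up DataFrame with some missing entries
toy = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})

dropped_rows = toy.dropna(axis=0)     # only row 0 has no missing values
dropped_cols = toy.dropna(axis=1)     # both columns contain a NaN, so none survive
filled_const = toy.fillna(1)          # every NaN becomes 1
filled_mean = toy.fillna(toy.mean())  # NaNs become the mean of their column
padded = toy.ffill()                  # each NaN takes the previous valid entry
interpolated = toy.interpolate()      # NaNs interpolated between valid neighbours
```

Which strategy is appropriate depends on the data: dropping rows loses valid entries from other columns, while imputing invents values, so it is worth checking the distribution first (as done below for PumpCount).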
In [12]:
# count missing entries for each column
df.isna().sum()
Out[12]:
IncidentNumber                   0
CalYear                          0
FinYear                          0
TypeOfIncident                   0
PumpCount                       72
PumpHoursTotal                  73
HourlyNotionalCost(£)            0
IncidentNotionalCost(£)         73
FinalDescription                 5
AnimalGroupParent                0
OriginofCall                     0
PropertyType                     0
PropertyCategory                 0
SpecialServiceTypeCategory       0
SpecialServiceType               0
WardCode                        15
Ward                            15
BoroughCode                     14
Borough                         14
StnGroundName                    0
UPRN                          6712
Street                           0
USRN                          1156
PostcodeDistrict                 0
Easting_m                     5693
Northing_m                    5693
Easting_rounded                  0
Northing_rounded                 0
Latitude                      5693
Longitude                     5693
dtype: int64
In [13]:
# If PumpCount is missing, typically so is PumpHoursTotal
# 73 rows are missing at least one of these
pump_missing = df["PumpCount"].isna() | df["PumpHoursTotal"].isna()
print(pump_missing.sum())
73
In [14]:
# so we could choose to drop these rows
df1 = df.drop(df.loc[pump_missing].index)
# here we made a new dataset df1 with these rows dropped
# to drop the rows from the original dataset df, we could do:
#
# df = df.drop(df.loc[pump_missing].index)
#
# or:
#
# df.drop(df.loc[pump_missing].index, inplace=True)
#
print(len(df1))
10630
In [15]:
# another equivalent way to do this
df2 = df.dropna(subset=["PumpCount", "PumpHoursTotal"])
print(len(df2))
10630
In [16]:
# but if we drop them, we lose valid data from other columns
# let's look at the distribution of values:
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
df.plot.hist(y="PumpCount", ax=axs[0])
df.plot.hist(y="PumpHoursTotal", ax=axs[1])
plt.show()
In [17]:
# looks like it would be better to replace missing PumpCount and PumpHoursTotal fields with 1
df.fillna({"PumpCount": 1, "PumpHoursTotal": 1}, inplace=True)
In [18]:
df.isna().sum()
Out[18]:
IncidentNumber                   0
CalYear                          0
FinYear                          0
TypeOfIncident                   0
PumpCount                        0
PumpHoursTotal                   0
HourlyNotionalCost(£)            0
IncidentNotionalCost(£)         73
FinalDescription                 5
AnimalGroupParent                0
OriginofCall                     0
PropertyType                     0
PropertyCategory                 0
SpecialServiceTypeCategory       0
SpecialServiceType               0
WardCode                        15
Ward                            15
BoroughCode                     14
Borough                         14
StnGroundName                    0
UPRN                          6712
Street                           0
USRN                          1156
PostcodeDistrict                 0
Easting_m                     5693
Northing_m                    5693
Easting_rounded                  0
Northing_rounded                 0
Latitude                      5693
Longitude                     5693
dtype: int64
Count the unique entries in each column¶
In [19]:
df.nunique().sort_values()
Out[19]:
TypeOfIncident                    1
PumpCount                         4
SpecialServiceTypeCategory        4
PropertyCategory                  7
OriginofCall                      8
PumpHoursTotal                   12
HourlyNotionalCost(£)            14
CalYear                          16
FinYear                          17
SpecialServiceType               24
AnimalGroupParent                28
BoroughCode                      37
Borough                          70
IncidentNotionalCost(£)          89
StnGroundName                   109
PropertyType                    192
PostcodeDistrict                280
Northing_rounded                428
Easting_rounded                 532
WardCode                        760
Ward                           1345
UPRN                           3802
Northing_m                     4514
Easting_m                      4592
Longitude                      4939
Latitude                       4939
FinalDescription               6502
USRN                           7086
Street                         7703
IncidentNumber                10703
dtype: int64
In [20]:
# "cat" and "Cat" are treated as different animals here:
df["AnimalGroupParent"].unique()
Out[20]:
array(['Dog', 'Fox', 'Horse', 'Rabbit', 'Unknown - Heavy Livestock Animal', 'Squirrel', 'Cat', 'Bird', 'Unknown - Domestic Animal Or Pet', 'Sheep', 'Deer', 'Unknown - Wild Animal', 'Snake', 'Lizard', 'Hedgehog', 'cat', 'Hamster', 'Lamb', 'Fish', 'Bull', 'Cow', 'Ferret', 'Budgie', 'Unknown - Animal rescue from water - Farm animal', 'Pigeon', 'Goat', 'Tortoise', 'Unknown - Animal rescue from below ground - Farm animal'], dtype=object)
In [21]:
# select rows where AnimalGroupParent is "cat", replace with "Cat"
df.loc[df["AnimalGroupParent"] == "cat", "AnimalGroupParent"] = "Cat"
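If there were many inconsistent spellings, fixing them one at a time like this would get tedious; an alternative (a sketch, not part of the original notebook) is to map all known variants to their canonical form in one call with Series.replace:

```python
import pandas as pd

# hypothetical column with inconsistent capitalisation
animals = pd.Series(["Dog", "cat", "Cat", "Fox"])

# map each known variant to its canonical spelling
cleaned = animals.replace({"cat": "Cat"})
```

The same dictionary can hold as many variant-to-canonical mappings as needed.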
In [22]:
df["AnimalGroupParent"].unique()
Out[22]:
array(['Dog', 'Fox', 'Horse', 'Rabbit', 'Unknown - Heavy Livestock Animal', 'Squirrel', 'Cat', 'Bird', 'Unknown - Domestic Animal Or Pet', 'Sheep', 'Deer', 'Unknown - Wild Animal', 'Snake', 'Lizard', 'Hedgehog', 'Hamster', 'Lamb', 'Fish', 'Bull', 'Cow', 'Ferret', 'Budgie', 'Unknown - Animal rescue from water - Farm animal', 'Pigeon', 'Goat', 'Tortoise', 'Unknown - Animal rescue from below ground - Farm animal'], dtype=object)
In [23]:
df.groupby("AnimalGroupParent")["IncidentNumber"].count().sort_values().plot.barh(
logx=True
)
plt.show()
In [24]:
# there are several different hourly costs
# do they depend on the type of incident? or do they just increase over time?
df["HourlyNotionalCost(£)"].unique()
Out[24]:
array([255, 260, 290, 295, 298, 326, 328, 333, 339, 346, 352, 364, 388, 430])
In [25]:
# just goes up over time
df["HourlyNotionalCost(£)"].plot.line()
Out[25]:
<Axes: xlabel='DateTimeOfCall'>
In [26]:
# Group incidents by fire station & count them
df.groupby("StnGroundName")["IncidentNumber"].count()
Out[26]:
StnGroundName
Acton          85
Addington      74
Barking        98
Barnet        102
Battersea      91
             ...
Whitechapel    33
Willesden      80
Wimbledon      88
Woodford      104
Woodside       91
Name: IncidentNumber, Length: 109, dtype: int64
Plot location of calls on a map¶
- note: this section uses some additional libraries; to install them:
pip install geopandas contextily
In [27]:
import geopandas
# drop missing longitude/latitude
df2 = df.dropna(subset=["Longitude", "Latitude"])
# also drop zero values
df2 = df2[df2["Latitude"] != 0]
# set crs to EPSG:4326 to specify WGS84 Latitude/Longitude
gdf = geopandas.GeoDataFrame(
df2,
geometry=geopandas.points_from_xy(df2["Longitude"], df2["Latitude"]),
crs="EPSG:4326",
)
In [28]:
gdf.head()
Out[28]:
IncidentNumber | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | AnimalGroupParent | ... | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DateTimeOfCall | |||||||||||||||||||||
2009-01-01 08:51:00 | 275091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Fox | ... | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 | POINT (-0.06417 51.39095) |
2009-01-04 10:07:00 | 2075091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 | POINT (-0.16199 51.36894) |
2009-01-05 12:27:00 | 2872091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Horse | ... | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 | POINT (-0.48968 51.60528) |
2009-01-07 06:29:00 | 4011091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Holloway Road | NaN | E11 | 539013.0 | 186162.0 | 539050 | 186150 | 51.557221 | 0.003880 | POINT (0.00388 51.55722) |
2009-01-07 11:55:00 | 4211091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Aldersbrook Road | NaN | E12 | 541327.0 | 186654.0 | 541350 | 186650 | 51.561067 | 0.037434 | POINT (0.03743 51.56107) |
5 rows × 31 columns
In [29]:
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
gdf.plot(ax=ax, color="black", alpha=0.3)
plt.title("Call locations")
# plt.axis("off")
plt.show()
In [30]:
import contextily as cx
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
gdf.plot(ax=ax, color="black", alpha=0.3)
# add a basemap of the region using contextily
cx.add_basemap(ax, crs=gdf.crs)
plt.title("Call locations")
plt.axis("off")
plt.show()
In [31]:
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
for animal, colour in [
("Cow", "black"),
("Deer", "red"),
("Fox", "blue"),
("Snake", "yellow"),
]:
gdf[gdf["AnimalGroupParent"] == animal].plot(
ax=ax, color=colour, alpha=0.5, label=animal
)
# add a basemap of the region using contextily
cx.add_basemap(ax, crs=gdf.crs)
plt.title("Call locations by animal")
plt.legend()
plt.axis("off")
plt.show()
Suggested workflow / philosophy¶
1. you want to do something but aren't sure how¶
- if you know / have a guess which function to use, look at its docstring:
?function_name
- if you don't have any idea what to try, google
how do I ... in pandas
- modern alternative: ask ChatGPT to
write python code using pandas to ...
- if in doubt, just try something!
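For example, to look up the docstring for dropna (the function name here is just an illustration): in Jupyter you could run pd.DataFrame.dropna?, and outside Jupyter the same text is available via the function's docstring:

```python
import pandas as pd

# in Jupyter: pd.DataFrame.dropna?
# outside Jupyter, the same documentation is in the docstring:
doc = pd.DataFrame.dropna.__doc__
```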
Suggested workflow / philosophy¶
2. you try something and get an error message¶
- copy & paste the last bit into google (along with the function_name and/or pandas)
- don't be intimidated by the long and apparently nonsensical error messages
- almost certainly someone else has had this exact problem
- almost certainly the solution is waiting for you
Suggested workflow / philosophy¶
3. look for a stackoverflow answer with many up-votes¶
- ignore the green tick: it just means that the person who asked the question accepted that answer
- typically an answer with many up-votes is a better option
- more recent answers can also be better: sometimes a library has changed since an older answer was written
Next steps¶
- experiment with your own datasets
- read some pandas documentation
- follow a tutorial
- free interactive kaggle courses