Data Exploration with Python and Jupyter
Basic usage of the Pandas library to download a dataset, explore its contents, clean up missing or invalid data, filter the data according to different criteria, and plot visualizations of the data.
- Part 1: Python and Jupyter
- Part 2: Pandas with toy data
- Part 3: Pandas with real data
Let's download some real data
For some reason, the London Fire Brigade provides a public spreadsheet of all animal rescue incidents since 2009:
https://data.london.gov.uk/dataset/animal-rescue-incidents-attended-by-lfb
They provide a link to the dataset in CSV (comma-separated values) format
In [1]:
# import the Pandas library & matplotlib for plotting
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# download a csv file with some data and convert it to a DataFrame
url = "https://data.london.gov.uk/download/animal-rescue-incidents-attended-by-lfb/01007433-55c2-4b8a-b799-626d9e3bc284/Animal%20Rescue%20incidents%20attended%20by%20LFB%20from%20Jan%202009.csv"
df = pd.read_csv(url)
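As a minimal sketch with toy data (not the LFB dataset): `read_csv` accepts any file-like object, so the same call can be tried on an in-memory CSV string before downloading anything.

```python
import io

import pandas as pd

# a small made-up CSV, just to show the shape of the call
csv_text = "animal,count\ncat,3\ndog,2\n"
toy = pd.read_csv(io.StringIO(csv_text))
print(toy.shape)  # (2, 2)
```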
Suggested workflow / philosophy
- you want to do something
  - if you know / have a guess which function to use, look at its docstring: ?function_name
  - if you don't have any idea what to try, google "how do I ... in pandas"
  - if in doubt, just try something!
- if you get an error, copy & paste the last bit into google (along with function_name and/or pandas)
  - don't be intimidated by the long and apparently nonsensical error messages
  - almost certainly someone else has had this exact problem
  - almost certainly the solution is waiting for you
- look for a stackoverflow answer with many up-votes
  - ignore the green tick, this just means the person asking the question liked the answer
  - typically an answer with many up-votes is a better option
  - more recent answers can also be better: sometimes a library has changed since an older answer was written

(For anyone who wasn't already doing this, that may be the most useful thing in this course)
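The ?function_name tip also works outside a notebook: the same docstring is available from plain Python via help() or the __doc__ attribute. A minimal sketch:

```python
import pandas as pd

# the text that ?pd.read_csv shows in a notebook lives on __doc__
doc = pd.read_csv.__doc__
# print just enough to confirm we found it
print("DataFrame" in doc)  # True
```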
Display the DataFrame
In [3]:
df
Out[3]:
IncidentNumber | DateTimeOfCall | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | ... | UPRN | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 139091 | 2009-01-01 03:01:00 | 2009 | 2008/09 | Special Service | 1.0 | 2.0 | 255 | 510.0 | Redacted | ... | NaN | Waddington Way | 20500146.0 | SE19 | NaN | NaN | 532350 | 170050 | NaN | NaN |
1 | 275091 | 2009-01-01 08:51:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 |
2 | 2075091 | 2009-01-04 10:07:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 |
3 | 2872091 | 2009-01-05 12:27:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | 1.000215e+11 | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 |
4 | 3553091 | 2009-01-06 15:23:00 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | ... | NaN | Swindon Lane | 21300122.0 | RM3 | NaN | NaN | 554650 | 192350 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9723 | 096744-30062023 | 2023-06-30 15:56:00 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | Redacted | ... | 1.000207e+11 | SELSDON PARK ROAD | 20502647.0 | CR2 | 536269.0 | 162904.0 | 536250 | 162950 | 51.348885 | -0.044629 |
9724 | 096880-30062023 | 2023-06-30 20:21:00 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | PIGEON STUCK BETWEEN FENCES IN PLAYING FIE... | ... | 2.070058e+08 | CAMBRIDGE GARDENS | 20702560.0 | EN1 | 534305.0 | 197280.0 | 534350 | 197250 | 51.658273 | -0.059731 |
9725 | 096884-30062023 | 2023-06-30 20:31:00 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | CAT STUCK BETWEEN BUILT IN FRIDGE AND WALL | ... | NaN | EASTFIELD ROAD | 20702215.0 | EN3 | NaN | NaN | 535650 | 198250 | NaN | NaN |
9726 | 096913-30062023 | 2023-06-30 21:24:00 | 2023 | 2023/24 | Special Service | 1.0 | 2.0 | 388 | 776.0 | Redacted | ... | NaN | NORBURY COURT ROAD | 20501229.0 | SW16 | NaN | NaN | 530650 | 169150 | NaN | NaN |
9727 | 096935-30062023 | 2023-06-30 22:26:00 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | Redacted | ... | NaN | QUEENSHURST SQUARE | 21880473.0 | KT2 | NaN | NaN | 518150 | 169750 | NaN | NaN |
9728 rows × 31 columns
Column data types
In [4]:
df.dtypes
Out[4]:
IncidentNumber                 object
DateTimeOfCall                 object
CalYear                         int64
FinYear                        object
TypeOfIncident                 object
PumpCount                     float64
PumpHoursTotal                float64
HourlyNotionalCost(£)           int64
IncidentNotionalCost(£)       float64
FinalDescription               object
AnimalGroupParent              object
OriginofCall                   object
PropertyType                   object
PropertyCategory               object
SpecialServiceTypeCategory     object
SpecialServiceType             object
WardCode                       object
Ward                           object
BoroughCode                    object
Borough                        object
StnGroundName                  object
UPRN                          float64
Street                         object
USRN                          float64
PostcodeDistrict               object
Easting_m                     float64
Northing_m                    float64
Easting_rounded                 int64
Northing_rounded                int64
Latitude                      float64
Longitude                     float64
dtype: object
Convert DateTimeOfCall to a date-time
In [5]:
df["DateTimeOfCall"].head()
Out[5]:
0    2009-01-01 03:01:00
1    2009-01-01 08:51:00
2    2009-01-04 10:07:00
3    2009-01-05 12:27:00
4    2009-01-06 15:23:00
Name: DateTimeOfCall, dtype: object
In [6]:
# this looks like what we want..
pd.to_datetime(df["DateTimeOfCall"]).head()
Out[6]:
0   2009-01-01 03:01:00
1   2009-01-01 08:51:00
2   2009-01-04 10:07:00
3   2009-01-05 12:27:00
4   2009-01-06 15:23:00
Name: DateTimeOfCall, dtype: datetime64[ns]
In [7]:
# ..but which number is the month and which is the day?
# how can we check if what we just did was correct?
pd.to_datetime(df["DateTimeOfCall"]).plot()
# should be a single monotonically increasing line: looks good!
Out[7]:
<Axes: >
In [8]:
# replace DateTimeOfCall column in dataframe with this one
df["DateTimeOfCall"] = pd.to_datetime(df["DateTimeOfCall"])
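On the day/month ambiguity checked above: with a toy string you can see how `to_datetime` resolves it by default, and how the `dayfirst` argument flips the interpretation (made-up example date, not from this dataset).

```python
import pandas as pd

# "01/02/2009" is ambiguous: by default pandas parses it month-first
s = pd.Series(["01/02/2009"])
month_first = pd.to_datetime(s).dt.month.iloc[0]
day_first = pd.to_datetime(s, dayfirst=True).dt.month.iloc[0]
print(month_first, day_first)  # 1 2
```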
Use the datetime as the index
In [9]:
df.set_index("DateTimeOfCall", inplace=True)
In [10]:
df
Out[10]:
IncidentNumber | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | AnimalGroupParent | ... | UPRN | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DateTimeOfCall | |||||||||||||||||||||
2009-01-01 03:01:00 | 139091 | 2009 | 2008/09 | Special Service | 1.0 | 2.0 | 255 | 510.0 | Redacted | Dog | ... | NaN | Waddington Way | 20500146.0 | SE19 | NaN | NaN | 532350 | 170050 | NaN | NaN |
2009-01-01 08:51:00 | 275091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Fox | ... | NaN | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 |
2009-01-04 10:07:00 | 2075091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | NaN | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 |
2009-01-05 12:27:00 | 2872091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Horse | ... | 1.000215e+11 | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 |
2009-01-06 15:23:00 | 3553091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Rabbit | ... | NaN | Swindon Lane | 21300122.0 | RM3 | NaN | NaN | 554650 | 192350 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2023-06-30 15:56:00 | 096744-30062023 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | Redacted | Cat | ... | 1.000207e+11 | SELSDON PARK ROAD | 20502647.0 | CR2 | 536269.0 | 162904.0 | 536250 | 162950 | 51.348885 | -0.044629 |
2023-06-30 20:21:00 | 096880-30062023 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | PIGEON STUCK BETWEEN FENCES IN PLAYING FIE... | Bird | ... | 2.070058e+08 | CAMBRIDGE GARDENS | 20702560.0 | EN1 | 534305.0 | 197280.0 | 534350 | 197250 | 51.658273 | -0.059731 |
2023-06-30 20:31:00 | 096884-30062023 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | CAT STUCK BETWEEN BUILT IN FRIDGE AND WALL | Cat | ... | NaN | EASTFIELD ROAD | 20702215.0 | EN3 | NaN | NaN | 535650 | 198250 | NaN | NaN |
2023-06-30 21:24:00 | 096913-30062023 | 2023 | 2023/24 | Special Service | 1.0 | 2.0 | 388 | 776.0 | Redacted | Cat | ... | NaN | NORBURY COURT ROAD | 20501229.0 | SW16 | NaN | NaN | 530650 | 169150 | NaN | NaN |
2023-06-30 22:26:00 | 096935-30062023 | 2023 | 2023/24 | Special Service | 1.0 | 1.0 | 388 | 388.0 | Redacted | Cat | ... | NaN | QUEENSHURST SQUARE | 21880473.0 | KT2 | NaN | NaN | 518150 | 169750 | NaN | NaN |
9728 rows × 30 columns
In [11]:
# can now use the datetime index to select rows: here is January 2021
df.loc["2021-01-01":"2021-01-31", "FinalDescription"]
Out[11]:
DateTimeOfCall
2021-01-01 12:09:00    KITTEN STUCK UP TREE AL REQUESTED FROM SCENE
2021-01-01 14:06:00    Redacted
2021-01-03 18:40:00    CAT WITH LEG TRAPPED IN BATH PLUGHOLE
2021-01-04 13:39:00    Redacted
2021-01-06 10:22:00    Redacted
2021-01-06 13:09:00    CAT IN DISTRESS ON ROOF - ADDITIONAL APPLIANCE...
2021-01-06 20:35:00    DOG TRAPPED IN FOX HOLE - MEET AT CLUB HOUSE
2021-01-07 23:50:00    KITTEN STUCK BETWEEN WALL AND ROOF
2021-01-09 08:01:00    DOG STUCK IN TRENCH
2021-01-10 19:27:00    Redacted
2021-01-12 11:39:00    Redacted
2021-01-12 22:38:00    CAT TRAPPED IN DITCH
2021-01-16 18:05:00    DOG TRAPPED IN PORTER CABIN
2021-01-17 16:09:00    DOG TRAPPED IN WAREHOUSE AREA - CALLER BELIEVE...
2021-01-17 17:09:00    BIRD TRAPPED IN NETTING CALLER WILL MEET YOU
2021-01-18 15:17:00    CAT STUCK IN TREE BEING ATTACKED BY CROWS
2021-01-18 17:06:00    ASSIST RSPCA - SMALL ANIMAL RESUE - BIRD ENTAN...
2021-01-19 18:28:00    CAT TRAPPED BEHIND CUPBOARD
2021-01-19 20:24:00    Redacted
2021-01-19 20:36:00    RUNNING CALL AT ON ROOF
2021-01-20 09:35:00    CAT STUCK BETWEEN TREE BRANCHES
2021-01-21 13:15:00    SWAN TRAPPED IN NETTING
2021-01-21 18:23:00    CAT TRAPPED IN CHIMNEY
2021-01-22 14:22:00    CAT TRAPPED BETWEEN WALL AND FENCE
2021-01-23 10:18:00    CAT TRAPPED IN CHIMNEY
2021-01-23 15:43:00    CAT TRAPPED BETWEEN WALLS
2021-01-23 17:16:00    Redacted
2021-01-25 12:02:00    ASSIST RSPCA WITH FOX STUCK DOWN CULVERT
2021-01-26 13:42:00    DOG STUCK IN RAILINGS - CALLER WILL MEET YOU
2021-01-26 18:21:00    Redacted
2021-01-26 22:44:00    BIRDS TRAPPED IN BASKETBALL COURT CALLER IS ON...
2021-01-26 23:35:00    FOX TRAPPED IN FENCE IN ALLEYWAY NEXT TO
2021-01-27 09:18:00    CAT STUCK IN TREE - ATTENDED YESTERDAY AND ADV...
2021-01-27 10:12:00    BIRD TRAPPED BY LEG IN A TREE - RSPCA IN ATTEN...
2021-01-27 15:22:00    CAT UP TREE ASSIST RSPCA
2021-01-29 10:47:00    TRAPPED FOX IN FENCE IN REAR GARDEN
2021-01-30 14:53:00    CAT STUCK UNDER SHED
2021-01-30 15:28:00    BIRD CAUGHT IN NETTING - RSPCA ON SCENE
2021-01-30 17:54:00    DOG TRAPPED UNDER CAR
2021-01-31 12:53:00    CAT STUCK UP TREE - RSPCA ON SCENE
2021-01-31 13:48:00    INJURED CAT STUCK IN GREEN AREA AT REAR OF
Name: FinalDescription, dtype: object
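With a DatetimeIndex, pandas also supports partial string indexing: a string like "2021-01" selects every row in that month, equivalent to the explicit slice used above. A toy sketch:

```python
import pandas as pd

# small made-up series with a DatetimeIndex
idx = pd.to_datetime(["2021-01-05", "2021-01-20", "2021-02-03"])
s = pd.Series(["a", "b", "c"], index=idx)
# partial date string selects the whole month
print(list(s.loc["2021-01"]))  # ['a', 'b']
```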
In [12]:
# resample the timeseries by month and count incidents
df.resample("M")["IncidentNumber"].count().plot(title="Monthly Calls")
# see https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
plt.show()
In [13]:
# resample by year, sum total costs, average hourly costs
fig, axs = plt.subplots(figsize=(16, 4), ncols=2)
df.resample("Y")["IncidentNotionalCost(£)"].sum().plot(
title="Year total cost", ax=axs[0]
)
df.resample("Y")["HourlyNotionalCost(£)"].mean().plot(
title="Average hourly cost", ax=axs[1]
)
plt.show()
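The same resampling pattern on a tiny synthetic series makes the binning explicit (toy data, not the LFB dataset; note that newer pandas versions prefer the "ME" alias over "M" for month-end):

```python
import pandas as pd

# 60 daily values starting 1 Jan 2021: all of January, all of February,
# and one day of March
idx = pd.date_range("2021-01-01", periods=60, freq="D")
s = pd.Series(1, index=idx)
# sum the daily values into monthly bins
monthly = s.resample("M").sum()
print(monthly.tolist())  # [31, 28, 1]
```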
Missing data
Different strategies for dealing with missing data:
- Ignore the issue
  - some things may break / not work as expected
- Remove rows/columns with missing data
  - remove all rows with missing data: df.dropna(axis=0)
  - remove all columns with missing data: df.dropna(axis=1)
- Guess (impute) missing data
  - replace all missing entries with a value: df.fillna(1)
  - replace missing entries with the mean of that column: df.fillna(df.mean())
  - replace each missing entry with the previous valid entry: df.ffill() (the older df.fillna(method="pad") spelling is deprecated)
  - replace missing entries by interpolating between valid entries: df.interpolate()
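The strategies above on a tiny made-up frame (values chosen only for illustration):

```python
import numpy as np
import pandas as pd

# toy frame with one missing value in each column
toy = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})

print(toy.dropna(axis=0).shape)          # (1, 2): only the complete row survives
print(toy.fillna(0).isna().sum().sum())  # 0: no missing entries left
print(toy.interpolate()["a"].iloc[1])    # 2.0: halfway between 1.0 and 3.0
```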
In [14]:
# count missing entries for each column
df.isna().sum()
Out[14]:
IncidentNumber                   0
CalYear                          0
FinYear                          0
TypeOfIncident                   0
PumpCount                       65
PumpHoursTotal                  66
HourlyNotionalCost(£)            0
IncidentNotionalCost(£)         66
FinalDescription                 5
AnimalGroupParent                0
OriginofCall                     0
PropertyType                     0
PropertyCategory                 0
SpecialServiceTypeCategory       0
SpecialServiceType               0
WardCode                        10
Ward                            10
BoroughCode                     12
Borough                         12
StnGroundName                    0
UPRN                          6127
Street                           0
USRN                          1156
PostcodeDistrict                 0
Easting_m                     5108
Northing_m                    5108
Easting_rounded                  0
Northing_rounded                 0
Latitude                      5108
Longitude                     5108
dtype: int64
In [15]:
# If PumpCount is missing, typically so is PumpHoursTotal
# 66 rows are missing at least one of these
pump_missing = df["PumpCount"].isna() | df["PumpHoursTotal"].isna()
print(pump_missing.sum())
66
In [16]:
# so we could choose to drop these rows
df1 = df.drop(df.loc[pump_missing].index)
# here we made a new dataset df1 with these rows dropped
# to drop the rows from the original dataset df, could do:
#
# df = df.drop(df.loc[pump_missing].index)
#
# or:
#
# df.drop(df.loc[pump_missing].index, inplace=True)
#
#
print(len(df1))
9662
In [17]:
# another equivalent way to do this
df2 = df.dropna(subset=["PumpCount", "PumpHoursTotal"])
print(len(df2))
9662
In [18]:
# but if we drop them, we lose valid data from other columns
# let's look at the distribution of values:
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
df.plot.hist(y="PumpCount", ax=axs[0])
df.plot.hist(y="PumpHoursTotal", ax=axs[1])
plt.show()
In [19]:
# looks like it would be better to replace missing PumpCount and PumpHoursTotal fields with 1
?df.fillna
df.fillna({"PumpCount": 1, "PumpHoursTotal": 1}, inplace=True)
In [20]:
df.isna().sum()
Out[20]:
IncidentNumber                   0
CalYear                          0
FinYear                          0
TypeOfIncident                   0
PumpCount                        0
PumpHoursTotal                   0
HourlyNotionalCost(£)            0
IncidentNotionalCost(£)         66
FinalDescription                 5
AnimalGroupParent                0
OriginofCall                     0
PropertyType                     0
PropertyCategory                 0
SpecialServiceTypeCategory       0
SpecialServiceType               0
WardCode                        10
Ward                            10
BoroughCode                     12
Borough                         12
StnGroundName                    0
UPRN                          6127
Street                           0
USRN                          1156
PostcodeDistrict                 0
Easting_m                     5108
Northing_m                    5108
Easting_rounded                  0
Northing_rounded                 0
Latitude                      5108
Longitude                     5108
dtype: int64
Count the unique entries in each column
In [21]:
df.nunique().sort_values()
Out[21]:
TypeOfIncident                   1
PumpCount                        4
SpecialServiceTypeCategory       4
PropertyCategory                 7
OriginofCall                     8
PumpHoursTotal                  12
HourlyNotionalCost(£)           13
CalYear                         15
FinYear                         16
SpecialServiceType              24
AnimalGroupParent               28
BoroughCode                     37
Borough                         70
IncidentNotionalCost(£)         82
StnGroundName                  108
PropertyType                   187
PostcodeDistrict               277
Northing_rounded               425
Easting_rounded                530
WardCode                       759
Ward                          1272
UPRN                          3446
Northing_m                    4188
Easting_m                     4254
Longitude                     4549
Latitude                      4549
FinalDescription              5907
USRN                          6496
Street                        7172
IncidentNumber                9728
dtype: int64
In [22]:
# "cat" and "Cat" are treated as different animals here:
df["AnimalGroupParent"].unique()
Out[22]:
array(['Dog', 'Fox', 'Horse', 'Rabbit', 'Unknown - Heavy Livestock Animal', 'Squirrel', 'Cat', 'Bird', 'Unknown - Domestic Animal Or Pet', 'Sheep', 'Deer', 'Unknown - Wild Animal', 'Snake', 'Lizard', 'Hedgehog', 'cat', 'Hamster', 'Lamb', 'Fish', 'Bull', 'Cow', 'Ferret', 'Budgie', 'Unknown - Animal rescue from water - Farm animal', 'Pigeon', 'Goat', 'Tortoise', 'Unknown - Animal rescue from below ground - Farm animal'], dtype=object)
In [23]:
# select rows where AnimalGroupParent is "cat", replace with "Cat"
df.loc[df["AnimalGroupParent"] == "cat", "AnimalGroupParent"] = "Cat"
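An alternative sketch on toy data: Series.replace with a mapping does the same normalisation, and scales better when several labels need fixing at once.

```python
import pandas as pd

# made-up labels with inconsistent casing
s = pd.Series(["Dog", "cat", "Cat", "fox"])
# map each bad label to its canonical form
cleaned = s.replace({"cat": "Cat", "fox": "Fox"})
print(sorted(cleaned.unique()))  # ['Cat', 'Dog', 'Fox']
```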
In [24]:
df["AnimalGroupParent"].unique()
Out[24]:
array(['Dog', 'Fox', 'Horse', 'Rabbit', 'Unknown - Heavy Livestock Animal', 'Squirrel', 'Cat', 'Bird', 'Unknown - Domestic Animal Or Pet', 'Sheep', 'Deer', 'Unknown - Wild Animal', 'Snake', 'Lizard', 'Hedgehog', 'Hamster', 'Lamb', 'Fish', 'Bull', 'Cow', 'Ferret', 'Budgie', 'Unknown - Animal rescue from water - Farm animal', 'Pigeon', 'Goat', 'Tortoise', 'Unknown - Animal rescue from below ground - Farm animal'], dtype=object)
In [25]:
df.groupby("AnimalGroupParent")["IncidentNumber"].count().sort_values().plot.barh(
logx=True
)
plt.show()
In [26]:
# apparently different hourly costs
# does it depend on the type of event? or does it just increase over time?
df["HourlyNotionalCost(£)"].unique()
Out[26]:
array([255, 260, 290, 295, 298, 326, 328, 333, 339, 346, 352, 364, 388])
In [27]:
# just goes up over time
df["HourlyNotionalCost(£)"].plot.line()
Out[27]:
<Axes: xlabel='DateTimeOfCall'>
In [28]:
# Group incidents by fire station & count them
df.groupby("StnGroundName")["IncidentNumber"].count()
Out[28]:
StnGroundName
Acton          74
Addington      66
Barking        91
Barnet         95
Battersea      82
               ..
Whitechapel    26
Willesden      68
Wimbledon      75
Woodford       95
Woodside       83
Name: IncidentNumber, Length: 108, dtype: int64
Plot location of calls on a map
- note: this section uses some more libraries, to install them:
pip install geopandas contextily
In [29]:
# drop missing longitude/latitude
df2 = df.dropna(subset=["Longitude", "Latitude"])
# also drop zero values
df2 = df2[df2["Latitude"] != 0]
# convert to geodataframe using geopandas
import geopandas
# set crs to EPSG:4326 to specify WGS84 Latitude/Longitude
gdf = geopandas.GeoDataFrame(
df2,
geometry=geopandas.points_from_xy(df2["Longitude"], df2["Latitude"]),
crs="EPSG:4326",
)
gdf.head()
Out[29]:
IncidentNumber | CalYear | FinYear | TypeOfIncident | PumpCount | PumpHoursTotal | HourlyNotionalCost(£) | IncidentNotionalCost(£) | FinalDescription | AnimalGroupParent | ... | Street | USRN | PostcodeDistrict | Easting_m | Northing_m | Easting_rounded | Northing_rounded | Latitude | Longitude | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DateTimeOfCall | |||||||||||||||||||||
2009-01-01 08:51:00 | 275091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Fox | ... | Grasmere Road | NaN | SE25 | 534785.0 | 167546.0 | 534750 | 167550 | 51.390954 | -0.064167 | POINT (-0.06417 51.39095) |
2009-01-04 10:07:00 | 2075091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Mill Lane | NaN | SM5 | 528041.0 | 164923.0 | 528050 | 164950 | 51.368941 | -0.161985 | POINT (-0.16199 51.36894) |
2009-01-05 12:27:00 | 2872091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Horse | ... | Park Lane | 21401484.0 | UB9 | 504689.0 | 190685.0 | 504650 | 190650 | 51.605283 | -0.489684 | POINT (-0.48968 51.60528) |
2009-01-07 06:29:00 | 4011091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Holloway Road | NaN | E11 | 539013.0 | 186162.0 | 539050 | 186150 | 51.557221 | 0.003880 | POINT (0.00388 51.55722) |
2009-01-07 11:55:00 | 4211091 | 2009 | 2008/09 | Special Service | 1.0 | 1.0 | 255 | 255.0 | Redacted | Dog | ... | Aldersbrook Road | NaN | E12 | 541327.0 | 186654.0 | 541350 | 186650 | 51.561067 | 0.037434 | POINT (0.03743 51.56107) |
5 rows × 31 columns
In [30]:
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
gdf.plot(ax=ax, color="black", alpha=0.3)
plt.title("Call locations")
# plt.axis("off")
plt.show()
In [31]:
import contextily as cx
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
gdf.plot(ax=ax, color="black", alpha=0.3)
# add a basemap of the region using contextily
cx.add_basemap(ax, crs=gdf.crs)
plt.title("Call locations")
plt.axis("off")
plt.show()
In [32]:
f, ax = plt.subplots(figsize=(16, 16))
# plot location of calls involving animals
for animal, colour in [
("Cow", "black"),
("Deer", "red"),
("Fox", "blue"),
("Snake", "yellow"),
]:
gdf[gdf["AnimalGroupParent"] == animal].plot(
ax=ax, color=colour, alpha=0.5, label=animal
)
# add a basemap of the region using contextily
cx.add_basemap(ax, crs=gdf.crs)
plt.title("Call locations by animal")
plt.legend()
plt.axis("off")
plt.show()
Next steps
- experiment with your own datasets
- read some pandas documentation
- follow a tutorial
- free interactive kaggle courses