Inserting data into the database and estimating execution time¶
In [ ]:
Copied!
from onehealth_db import postgresql_database as db
from pathlib import Path
import time
import xarray as xr
import pandas as pd
import os
import dotenv
from onehealth_db import postgresql_database as db
from pathlib import Path
import time
import xarray as xr
import pandas as pd
import os
import dotenv
Set up necessary variables¶
In [ ]:
Copied!
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
# or provide a .env file with the DB_URL variable
dotenv.load_dotenv()
db_url = os.getenv("DB_URL")
# initialize the database
engine = db.initialize_database(db_url, replace=True)
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
# or provide a .env file with the DB_URL variable
dotenv.load_dotenv()
db_url = os.getenv("DB_URL")
# initialize the database
engine = db.initialize_database(db_url, replace=True)
In [ ]:
Copied!
# record running time
run_time = {}
# variable types
var_types = [
{"name": "t2m", "unit": "Celsius", "description": "2m temperature"},
{"name": "tp", "unit": "mm", "description": "Total precipitation"},
{"name": "total-population", "unit": "1", "description": "Total population"},
{
"name": "R0",
"unit": "1",
"description": "West Nile virus transmission suitability",
},
]
# record running time
run_time = {}
# variable types
var_types = [
{"name": "t2m", "unit": "Celsius", "description": "2m temperature"},
{"name": "tp", "unit": "mm", "description": "Total precipitation"},
{"name": "total-population", "unit": "1", "description": "Total population"},
{
"name": "R0",
"unit": "1",
"description": "West Nile virus transmission suitability",
},
]
Add data into the database¶
In [ ]:
Copied!
# start recording time
t0 = time.time()
# start recording time
t0 = time.time()
In [ ]:
Copied!
# paths to data
data_path = Path("../../../data")
shapefile_path = data_path / "in" / "NUTS_RG_20M_2024_4326.shp"
era5_land_path = (
data_path
/ "in"
/ "era5_data_2016_2017_all_2t_tp_monthly_unicoords_adjlon_celsius_mm_05deg_trim.nc"
)
isimip_path = (
data_path / "in" / "population_histsoc_30arcmin_annual_2016_2017_renamed.nc"
)
transmission_suitability_WNV_path = data_path / "in" / "output_jmodel_europe.nc"
# paths to data
data_path = Path("../../../data")
shapefile_path = data_path / "in" / "NUTS_RG_20M_2024_4326.shp"
era5_land_path = (
data_path
/ "in"
/ "era5_data_2016_2017_all_2t_tp_monthly_unicoords_adjlon_celsius_mm_05deg_trim.nc"
)
isimip_path = (
data_path / "in" / "population_histsoc_30arcmin_annual_2016_2017_renamed.nc"
)
transmission_suitability_WNV_path = data_path / "in" / "output_jmodel_europe.nc"
In [ ]:
Copied!
# add NUTS definition data
db.insert_nuts_def(engine, shapefile_path)
t_nuts_def = time.time()
# add NUTS definition data
db.insert_nuts_def(engine, shapefile_path)
t_nuts_def = time.time()
In [ ]:
Copied!
# add variable types
var_type_session = db.create_session(engine)
db.insert_var_types(var_type_session, var_types)
var_type_session.close()
t_var_type = time.time()
# add variable types
var_type_session = db.create_session(engine)
db.insert_var_types(var_type_session, var_types)
var_type_session.close()
t_var_type = time.time()
In [ ]:
Copied!
era5_ds = xr.open_dataset(era5_land_path, chunks={})
isimip_ds = xr.open_dataset(isimip_path, chunks={})
R0_ds = xr.open_dataset(transmission_suitability_WNV_path, chunks={})
# rechunk the dataset
era5_ds = era5_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
isimip_ds = isimip_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
R0_ds = R0_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
# add grid points
grid_point_session = db.create_session(engine)
db.insert_grid_points(
grid_point_session,
latitudes=era5_ds.latitude.to_numpy(),
longitudes=era5_ds.longitude.to_numpy(),
)
grid_point_session.close()
t_grid_point = time.time()
era5_ds = xr.open_dataset(era5_land_path, chunks={})
isimip_ds = xr.open_dataset(isimip_path, chunks={})
R0_ds = xr.open_dataset(transmission_suitability_WNV_path, chunks={})
# rechunk the dataset
era5_ds = era5_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
isimip_ds = isimip_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
R0_ds = R0_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
# add grid points
grid_point_session = db.create_session(engine)
db.insert_grid_points(
grid_point_session,
latitudes=era5_ds.latitude.to_numpy(),
longitudes=era5_ds.longitude.to_numpy(),
)
grid_point_session.close()
t_grid_point = time.time()
In [ ]:
Copied!
# add time points
time_point_session = db.create_session(engine)
db.insert_time_points(
time_point_session,
time_point_data=[
(era5_ds.time.to_numpy(), False),
(isimip_ds.time.to_numpy(), True),
],
) # True means yearly data
time_point_session.close()
t_time_point = time.time()
# add time points
time_point_session = db.create_session(engine)
db.insert_time_points(
time_point_session,
time_point_data=[
(era5_ds.time.to_numpy(), False),
(isimip_ds.time.to_numpy(), True),
],
) # True means yearly data
time_point_session.close()
t_time_point = time.time()
In [ ]:
Copied!
# get id maps for grid, time, and variable types
id_map_session = db.create_session(engine)
grid_id_map, time_id_map, var_type_id_map = db.get_id_maps(id_map_session)
id_map_session.close()
t_get_id_map = time.time()
# get id maps for grid, time, and variable types
id_map_session = db.create_session(engine)
grid_id_map, time_id_map, var_type_id_map = db.get_id_maps(id_map_session)
id_map_session.close()
t_get_id_map = time.time()
In [ ]:
Copied!
# add t2m values
_, t_start_insert_t2m = db.insert_var_values(
engine, era5_ds, "t2m", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_t2m = time.time()
# add t2m values
_, t_start_insert_t2m = db.insert_var_values(
engine, era5_ds, "t2m", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_t2m = time.time()
In [ ]:
Copied!
# add total precipitation values
_, t_start_insert_tp = db.insert_var_values(
engine, era5_ds, "tp", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_tp = time.time()
# add total precipitation values
_, t_start_insert_tp = db.insert_var_values(
engine, era5_ds, "tp", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_tp = time.time()
In [ ]:
Copied!
# add population data
t_yearly_to_monthly, t_start_insert_popu = db.insert_var_values(
engine,
isimip_ds,
"total-population",
grid_id_map,
time_id_map,
var_type_id_map,
to_monthly=False,
)
t_inserted_popu = time.time()
# add population data
t_yearly_to_monthly, t_start_insert_popu = db.insert_var_values(
engine,
isimip_ds,
"total-population",
grid_id_map,
time_id_map,
var_type_id_map,
to_monthly=False,
)
t_inserted_popu = time.time()
In [ ]:
Copied!
# add R0 values
_, t_start_insert_R0 = db.insert_var_values(
engine, R0_ds, "R0", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_R0 = time.time()
# add R0 values
_, t_start_insert_R0 = db.insert_var_values(
engine, R0_ds, "R0", grid_id_map, time_id_map, var_type_id_map
)
t_inserted_R0 = time.time()
In [ ]:
Copied!
t_end = time.time()
t_end = time.time()
In [ ]:
Copied!
# calculate execution time
run_time["nuts_def"] = t_nuts_def - t0
run_time["var_type"] = t_var_type - t_nuts_def
run_time["grid_point"] = t_grid_point - t_var_type
run_time["time_point"] = t_time_point - t_grid_point
run_time["get_id_map"] = t_get_id_map - t_time_point
run_time["prepare_insert_t2m"] = t_start_insert_t2m - t_get_id_map
run_time["inserted_t2m"] = t_inserted_t2m - t_start_insert_t2m
run_time["prepare_insert_tp"] = t_start_insert_tp - t_inserted_t2m
run_time["inserted_tp"] = t_inserted_tp - t_start_insert_tp
run_time["popu_yearly_to_monthly"] = t_yearly_to_monthly - t_inserted_tp
run_time["prepare_insert_popu"] = t_start_insert_popu - t_yearly_to_monthly
run_time["inserted_popu"] = t_inserted_popu - t_start_insert_popu
run_time["prepare_insert_R0"] = t_start_insert_R0 - t_inserted_R0
run_time["inserted_R0"] = t_inserted_R0 - t_start_insert_R0
total_time = t_end - t0
# calculate execution time
run_time["nuts_def"] = t_nuts_def - t0
run_time["var_type"] = t_var_type - t_nuts_def
run_time["grid_point"] = t_grid_point - t_var_type
run_time["time_point"] = t_time_point - t_grid_point
run_time["get_id_map"] = t_get_id_map - t_time_point
run_time["prepare_insert_t2m"] = t_start_insert_t2m - t_get_id_map
run_time["inserted_t2m"] = t_inserted_t2m - t_start_insert_t2m
run_time["prepare_insert_tp"] = t_start_insert_tp - t_inserted_t2m
run_time["inserted_tp"] = t_inserted_tp - t_start_insert_tp
run_time["popu_yearly_to_monthly"] = t_yearly_to_monthly - t_inserted_tp
run_time["prepare_insert_popu"] = t_start_insert_popu - t_yearly_to_monthly
run_time["inserted_popu"] = t_inserted_popu - t_start_insert_popu
run_time["prepare_insert_R0"] = t_start_insert_R0 - t_inserted_R0
run_time["inserted_R0"] = t_inserted_R0 - t_start_insert_R0
total_time = t_end - t0
In [ ]:
Copied!
runtime_df = pd.DataFrame(run_time, index=[0])
# convert seconds to minutes
runtime_df = (runtime_df / 60).round(4)
total_time = round(total_time / 60, 4)
runtime_df.to_csv(
data_path / "postgres_runtime_{}.csv".format(time.strftime("%Y-%m-%d")), index=False
)
runtime_df.plot.bar()
runtime_df = pd.DataFrame(run_time, index=[0])
# convert seconds to minutes
runtime_df = (runtime_df / 60).round(4)
total_time = round(total_time / 60, 4)
runtime_df.to_csv(
data_path / "postgres_runtime_{}.csv".format(time.strftime("%Y-%m-%d")), index=False
)
runtime_df.plot.bar()
In [ ]:
Copied!
unit = "minutes"
print(f"NUTS definition data inserted in {runtime_df.loc[0, 'nuts_def']} {unit}.")
print(f"Variable types inserted in {runtime_df.loc[0, 'var_type']} {unit}.")
print(f"Grid points inserted in {runtime_df.loc[0, 'grid_point']} {unit}.")
print(f"Time points inserted in {runtime_df.loc[0, 'time_point']} {unit}.")
print(f"ID maps retrieved in {runtime_df.loc[0, 'get_id_map']} {unit}.")
print(
f"t2m variable values prepared in {runtime_df.loc[0, 'prepare_insert_t2m']} {unit}."
)
print(f"t2m variable values inserted in {runtime_df.loc[0, 'inserted_t2m']} {unit}.")
print(
f"tp variable values prepared in {runtime_df.loc[0, 'prepare_insert_tp']} {unit}."
)
print(f"tp variable values inserted in {runtime_df.loc[0, 'inserted_tp']} {unit}.")
print(
f"Population data converted from yearly to monthly in {runtime_df.loc[0, 'popu_yearly_to_monthly']} {unit}."
)
print(
f"Population variable values prepared in {runtime_df.loc[0, 'prepare_insert_popu']} {unit}."
)
print(
f"Population variable values inserted in {runtime_df.loc[0, 'inserted_popu']} {unit}."
)
print(
f"R0 variable values prepared in {runtime_df.loc[0, 'prepare_insert_R0']} {unit}."
)
print(f"R0 variable values inserted in {runtime_df.loc[0, 'inserted_R0']} {unit}.")
print(f"Total execution time: {total_time} {unit}.")
unit = "minutes"
print(f"NUTS definition data inserted in {runtime_df.loc[0, 'nuts_def']} {unit}.")
print(f"Variable types inserted in {runtime_df.loc[0, 'var_type']} {unit}.")
print(f"Grid points inserted in {runtime_df.loc[0, 'grid_point']} {unit}.")
print(f"Time points inserted in {runtime_df.loc[0, 'time_point']} {unit}.")
print(f"ID maps retrieved in {runtime_df.loc[0, 'get_id_map']} {unit}.")
print(
f"t2m variable values prepared in {runtime_df.loc[0, 'prepare_insert_t2m']} {unit}."
)
print(f"t2m variable values inserted in {runtime_df.loc[0, 'inserted_t2m']} {unit}.")
print(
f"tp variable values prepared in {runtime_df.loc[0, 'prepare_insert_tp']} {unit}."
)
print(f"tp variable values inserted in {runtime_df.loc[0, 'inserted_tp']} {unit}.")
print(
f"Population data converted from yearly to monthly in {runtime_df.loc[0, 'popu_yearly_to_monthly']} {unit}."
)
print(
f"Population variable values prepared in {runtime_df.loc[0, 'prepare_insert_popu']} {unit}."
)
print(
f"Population variable values inserted in {runtime_df.loc[0, 'inserted_popu']} {unit}."
)
print(
f"R0 variable values prepared in {runtime_df.loc[0, 'prepare_insert_R0']} {unit}."
)
print(f"R0 variable values inserted in {runtime_df.loc[0, 'inserted_R0']} {unit}.")
print(f"Total execution time: {total_time} {unit}.")
Retrieve data from database¶
In [ ]:
Copied!
from sqlalchemy import create_engine
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
db_url = "postgresql+psycopg2://postgres:postgres@localhost:5432/onehealth_db"
# get the database engine if needed
engine = create_engine(db_url)
from sqlalchemy import create_engine
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
db_url = "postgresql+psycopg2://postgres:postgres@localhost:5432/onehealth_db"
# get the database engine if needed
engine = create_engine(db_url)
In [ ]:
Copied!
latitude = -6.25
longitude = 106.75
year = 2021
month = 1
day = 1
var_name = "total-population"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
latitude = -6.25
longitude = 106.75
year = 2021
month = 1
day = 1
var_name = "total-population"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
In [ ]:
Copied!
latitude = 49.25
longitude = 8.75
year = 2016
month = 2
day = 1
var_name = "t2m"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
latitude = 49.25
longitude = 8.75
year = 2016
month = 2
day = 1
var_name = "t2m"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
In [ ]:
Copied!
latitude = 49.25
longitude = 8.75
year = 2016
month = 2
day = 1
var_name = "R0"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
latitude = 49.25
longitude = 8.75
year = 2016
month = 2
day = 1
var_name = "R0"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(
retrieve_session, var_name, latitude, longitude, year, month, day
)
retrieve_session.close()
t_end_retrieving = time.time()
print(
f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds."
)
In [ ]:
Copied!