Inserting data into the database and estimating execution time¶
In [ ]:
Copied!
from onehealth_db import postgresql_database as db
from pathlib import Path
import time
import xarray as xr
import pandas as pd
import os
from onehealth_db import postgresql_database as db
from pathlib import Path
import time
import xarray as xr
import pandas as pd
import os
Set up necessary variables¶
In [ ]:
Copied!
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
# initialize the database
engine = db.initialize_database(db_url, replace=True)
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
# initialize the database
engine = db.initialize_database(db_url, replace=True)
In [ ]:
Copied!
# record running time
run_time = {}
# variable types
var_types = [
{
"name": "t2m",
"unit": "Celsius",
"description": "2m temperature"
},
{
"name": "tp",
"unit": "mm",
"description": "Total precipitation"
},
{
"name": "total-population",
"unit": "1",
"description": "Total population"
}
]
# record running time
run_time = {}
# variable types
var_types = [
{
"name": "t2m",
"unit": "Celsius",
"description": "2m temperature"
},
{
"name": "tp",
"unit": "mm",
"description": "Total precipitation"
},
{
"name": "total-population",
"unit": "1",
"description": "Total population"
}
]
Add data into the database¶
In [ ]:
Copied!
# start recording time
t0 = time.time()
# start recording time
t0 = time.time()
In [ ]:
Copied!
# paths to data
data_path = Path("../../../data")
shapefile_path = data_path / "in" / "NUTS_RG_20M_2024_4326.shp"
era5_land_path = data_path / "out" / "era5_data_2020_to_2025_all_2t_tp_monthly_celsius_mm_resampled_05degree_trim.nc"
isimip_path = data_path / "in" / "population_histsoc_30arcmin_annual_1950_2021_renamed.nc"
# paths to data
data_path = Path("../../../data")
shapefile_path = data_path / "in" / "NUTS_RG_20M_2024_4326.shp"
era5_land_path = data_path / "out" / "era5_data_2020_to_2025_all_2t_tp_monthly_celsius_mm_resampled_05degree_trim.nc"
isimip_path = data_path / "in" / "population_histsoc_30arcmin_annual_1950_2021_renamed.nc"
In [ ]:
Copied!
# add NUTS definition data
db.insert_nuts_def(engine, shapefile_path)
t_nuts_def = time.time()
# add NUTS definition data
db.insert_nuts_def(engine, shapefile_path)
t_nuts_def = time.time()
In [ ]:
Copied!
# add variable types
var_type_session = db.create_session(engine)
db.insert_var_types(var_type_session, var_types)
var_type_session.close()
t_var_type = time.time()
# add variable types
var_type_session = db.create_session(engine)
db.insert_var_types(var_type_session, var_types)
var_type_session.close()
t_var_type = time.time()
In [ ]:
Copied!
era5_ds = xr.open_dataset(era5_land_path, chunks={})
isimip_ds = xr.open_dataset(isimip_path, chunks={})
# rechunk the dataset
era5_ds = era5_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
isimip_ds = isimip_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
# add grid points
grid_point_session = db.create_session(engine)
db.insert_grid_points(grid_point_session,
latitudes=era5_ds.latitude.to_numpy(),
longitudes=era5_ds.longitude.to_numpy())
grid_point_session.close()
t_grid_point = time.time()
era5_ds = xr.open_dataset(era5_land_path, chunks={})
isimip_ds = xr.open_dataset(isimip_path, chunks={})
# rechunk the dataset
era5_ds = era5_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
isimip_ds = isimip_ds.chunk({"time": 1, "latitude": 180, "longitude": 360})
# add grid points
grid_point_session = db.create_session(engine)
db.insert_grid_points(grid_point_session,
latitudes=era5_ds.latitude.to_numpy(),
longitudes=era5_ds.longitude.to_numpy())
grid_point_session.close()
t_grid_point = time.time()
In [ ]:
Copied!
# add time points
time_point_session = db.create_session(engine)
db.insert_time_points(time_point_session,
time_point_data=[(era5_ds.time.to_numpy(), False),
(isimip_ds.time.to_numpy(), True)]) # True means yearly data
time_point_session.close()
t_time_point = time.time()
# add time points
time_point_session = db.create_session(engine)
db.insert_time_points(time_point_session,
time_point_data=[(era5_ds.time.to_numpy(), False),
(isimip_ds.time.to_numpy(), True)]) # True means yearly data
time_point_session.close()
t_time_point = time.time()
In [ ]:
Copied!
# get id maps for grid, time, and variable types
id_map_session = db.create_session(engine)
grid_id_map, time_id_map, var_type_id_map = db.get_id_maps(id_map_session)
id_map_session.close()
t_get_id_map = time.time()
# get id maps for grid, time, and variable types
id_map_session = db.create_session(engine)
grid_id_map, time_id_map, var_type_id_map = db.get_id_maps(id_map_session)
id_map_session.close()
t_get_id_map = time.time()
In [ ]:
Copied!
# add t2m values
_, t_start_insert_t2m = db.insert_var_values(engine, era5_ds, "t2m", grid_id_map, time_id_map, var_type_id_map)
t_inserted_t2m = time.time()
# add t2m values
_, t_start_insert_t2m = db.insert_var_values(engine, era5_ds, "t2m", grid_id_map, time_id_map, var_type_id_map)
t_inserted_t2m = time.time()
In [ ]:
Copied!
# add total precipitation values
_, t_start_insert_tp = db.insert_var_values(engine, era5_ds, "tp", grid_id_map, time_id_map, var_type_id_map)
t_inserted_tp = time.time()
# add total precipitation values
_, t_start_insert_tp = db.insert_var_values(engine, era5_ds, "tp", grid_id_map, time_id_map, var_type_id_map)
t_inserted_tp = time.time()
In [ ]:
Copied!
# add population data
t_yearly_to_monthly, t_start_insert_popu = db.insert_var_values(engine,
isimip_ds,
"total-population",
grid_id_map, time_id_map, var_type_id_map,
to_monthly=False)
t_inserted_popu = time.time()
# add population data
t_yearly_to_monthly, t_start_insert_popu = db.insert_var_values(engine,
isimip_ds,
"total-population",
grid_id_map, time_id_map, var_type_id_map,
to_monthly=False)
t_inserted_popu = time.time()
In [ ]:
Copied!
t_end = time.time()
t_end = time.time()
In [ ]:
Copied!
# calculate execution time
run_time["nuts_def"] = t_nuts_def - t0
run_time["var_type"] = t_var_type - t_nuts_def
run_time["grid_point"] = t_grid_point - t_var_type
run_time["time_point"] = t_time_point - t_grid_point
run_time["get_id_map"] = t_get_id_map - t_time_point
run_time["prepare_insert_t2m"] = t_start_insert_t2m - t_get_id_map
run_time["inserted_t2m"] = t_inserted_t2m - t_start_insert_t2m
run_time["prepare_insert_tp"] = t_start_insert_tp - t_inserted_t2m
run_time["inserted_tp"] = t_inserted_tp - t_start_insert_tp
run_time["popu_yearly_to_monthly"] = t_yearly_to_monthly - t_inserted_tp
run_time["prepare_insert_popu"] = t_start_insert_popu - t_yearly_to_monthly
run_time["inserted_popu"] = t_inserted_popu - t_start_insert_popu
total_time = t_end - t0
# calculate execution time
run_time["nuts_def"] = t_nuts_def - t0
run_time["var_type"] = t_var_type - t_nuts_def
run_time["grid_point"] = t_grid_point - t_var_type
run_time["time_point"] = t_time_point - t_grid_point
run_time["get_id_map"] = t_get_id_map - t_time_point
run_time["prepare_insert_t2m"] = t_start_insert_t2m - t_get_id_map
run_time["inserted_t2m"] = t_inserted_t2m - t_start_insert_t2m
run_time["prepare_insert_tp"] = t_start_insert_tp - t_inserted_t2m
run_time["inserted_tp"] = t_inserted_tp - t_start_insert_tp
run_time["popu_yearly_to_monthly"] = t_yearly_to_monthly - t_inserted_tp
run_time["prepare_insert_popu"] = t_start_insert_popu - t_yearly_to_monthly
run_time["inserted_popu"] = t_inserted_popu - t_start_insert_popu
total_time = t_end - t0
In [ ]:
Copied!
runtime_df = pd.DataFrame(run_time, index=[0])
# convert seconds to minutes
runtime_df = (runtime_df / 60).round(4)
total_time = round(total_time / 60, 4)
runtime_df.to_csv(data_path / f"postgres_runtime_{time.strftime("%Y-%m-%d")}.csv", index=False)
runtime_df.plot.bar()
runtime_df = pd.DataFrame(run_time, index=[0])
# convert seconds to minutes
runtime_df = (runtime_df / 60).round(4)
total_time = round(total_time / 60, 4)
runtime_df.to_csv(data_path / f"postgres_runtime_{time.strftime("%Y-%m-%d")}.csv", index=False)
runtime_df.plot.bar()
In [ ]:
Copied!
unit = "minutes"
print(f"NUTS definition data inserted in {runtime_df.loc[0, 'nuts_def']} {unit}.")
print(f"Variable types inserted in {runtime_df.loc[0, 'var_type']} {unit}.")
print(f"Grid points inserted in {runtime_df.loc[0, 'grid_point']} {unit}.")
print(f"Time points inserted in {runtime_df.loc[0, 'time_point']} {unit}.")
print(f"ID maps retrieved in {runtime_df.loc[0, 'get_id_map']} {unit}.")
print(f"t2m variable values prepared in {runtime_df.loc[0, 'prepare_insert_t2m']} {unit}.")
print(f"t2m variable values inserted in {runtime_df.loc[0, 'inserted_t2m']} {unit}.")
print(f"tp variable values prepared in {runtime_df.loc[0, 'prepare_insert_tp']} {unit}.")
print(f"tp variable values inserted in {runtime_df.loc[0, 'inserted_tp']} {unit}.")
print(f"Population data converted from yearly to monthly in {runtime_df.loc[0, 'popu_yearly_to_monthly']} {unit}.")
print(f"Population variable values prepared in {runtime_df.loc[0, 'prepare_insert_popu']} {unit}.")
print(f"Population variable values inserted in {runtime_df.loc[0, 'inserted_popu']} {unit}.")
print(f"Total execution time: {total_time} {unit}.")
unit = "minutes"
print(f"NUTS definition data inserted in {runtime_df.loc[0, 'nuts_def']} {unit}.")
print(f"Variable types inserted in {runtime_df.loc[0, 'var_type']} {unit}.")
print(f"Grid points inserted in {runtime_df.loc[0, 'grid_point']} {unit}.")
print(f"Time points inserted in {runtime_df.loc[0, 'time_point']} {unit}.")
print(f"ID maps retrieved in {runtime_df.loc[0, 'get_id_map']} {unit}.")
print(f"t2m variable values prepared in {runtime_df.loc[0, 'prepare_insert_t2m']} {unit}.")
print(f"t2m variable values inserted in {runtime_df.loc[0, 'inserted_t2m']} {unit}.")
print(f"tp variable values prepared in {runtime_df.loc[0, 'prepare_insert_tp']} {unit}.")
print(f"tp variable values inserted in {runtime_df.loc[0, 'inserted_tp']} {unit}.")
print(f"Population data converted from yearly to monthly in {runtime_df.loc[0, 'popu_yearly_to_monthly']} {unit}.")
print(f"Population variable values prepared in {runtime_df.loc[0, 'prepare_insert_popu']} {unit}.")
print(f"Population variable values inserted in {runtime_df.loc[0, 'inserted_popu']} {unit}.")
print(f"Total execution time: {total_time} {unit}.")
Retrieve data from database¶
In [ ]:
Copied!
from sqlalchemy import create_engine
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
# get the database engine if needed
engine = create_engine(db_url)
from sqlalchemy import create_engine
# PostgreSQL database URL
# replace with your actual database URL
# e.g. "postgresql+psycopg2://user:password@localhost:port/mydatabase"
db_url = os.getenv("DB_URL")
# get the database engine if needed
engine = create_engine(db_url)
In [ ]:
Copied!
latitude = -6.25
longitude=106.75
year = 2021
month = 1
day = 1
var_name = "total-population"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(retrieve_session, var_name, latitude, longitude, year, month, day)
retrieve_session.close()
t_end_retrieving = time.time()
print(f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds.")
latitude = -6.25
longitude=106.75
year = 2021
month = 1
day = 1
var_name = "total-population"
t_start_retrieving = time.time()
retrieve_session = db.create_session(engine)
var_value = db.get_var_value(retrieve_session, var_name, latitude, longitude, year, month, day)
retrieve_session.close()
t_end_retrieving = time.time()
print(f"Retrieved {var_name} value: {var_value} in {t_end_retrieving - t_start_retrieving} seconds.")