{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "98e6e23d-5ca7-4706-b1d9-dd57b54888ef",
   "metadata": {},
   "source": [
    "# Data preprocessing for further calculations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5324ceb9-24e7-454b-87b9-ba9a717078ae",
   "metadata": {},
   "source": [
    "### Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7b2a7f44-b0cb-4471-a0c6-e56da23caf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b2b903f-fa30-4e35-97e1-74bc0ee6b944",
   "metadata": {},
   "source": [
    "### Helper variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "36b9f49e-32e6-4544-a9d3-f6a8ba49d867",
   "metadata": {},
   "outputs": [],
   "source": [
    "# directory with the source model data\n",
    "# also available at https://eee.ipfran.ru/files/seasonal-variation-2024/\n",
    "# attention: the files are very large (~ 350 GB totally)\n",
    "src_path = \"../shared_files/eee_public_files/seasonal-variation-2024/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "78a4350c-59fb-479a-b7cd-e2bf9b996d36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# used numbers of simulated days for analysis\n",
    "wrf_N_days = 4992\n",
    "inmcm_N_days = 3650"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "53cb9cc3-0e56-4da4-920b-2f071a0846fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dates corresponding to the indices (0 axis) of the data arrays\n",
    "# note: for WRF dates correspond to real dates (every third day is taken);\n",
    "# for INMCM the index is wrapped with a period of 365 days, so the same\n",
    "# 365 dates of the (non-leap) year 2022 are repeated for each simulated year\n",
    "\n",
    "wrf_dt_indicies = np.array(\n",
    "    [dt.date(1980, 1, 1) + dt.timedelta(i * 3) for i in range(wrf_N_days)]\n",
    ")\n",
    "inmcm_dt_indicies = np.array(\n",
    "    [dt.date(2022, 1, 1) + dt.timedelta(i % 365) for i in range(inmcm_N_days)]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7",
   "metadata": {},
   "source": [
    "### Preprocessing WRF T2m data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "87860fa8-0a9c-4304-9c3c-94561c3e966c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# air temperature at the height of 2 m with the shape\n",
    "# (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "# contains temperature values depending on (d, h, lat, lon)\n",
    "# d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "#   every third day is taken\n",
    "#   d = 0 corresponds to 1 Jan 1980,\n",
    "#   d = 5113 corresponds to 30 Dec 2021\n",
    "#   d = 4991 corresponds to 29 Dec 2020\n",
    "#   (we will restrict our attention to 1980–2020)\n",
    "# h (axis 1) is the hour of the day (an integer in [0, 24])\n",
    "#   the values corresponding to h = 0 and h = 24 are the same\n",
    "# lat (axis 2) describes the latitude (an integer in [0, 179])\n",
    "# lon (axis 3) describes the longitude (an integer in [0, 359])\n",
    "\n",
    "# keeping only the analysed days and discarding the last hour,\n",
    "# which duplicates the first one\n",
    "wrf_T2_data = np.load(f\"{src_path}/WRF-T2-MAP.npy\")[:wrf_N_days, :24]\n",
    "\n",
    "# averaging over hours (axis 1) and longitudes (axis 3)\n",
    "wrf_T2_data_DAYxLAT = wrf_T2_data.mean(axis=(1, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1124d9f9-95d9-4c02-8176-82b9c0331d34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4992, 180)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wrf_T2_data_DAYxLAT.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ec569ffd-93c2-4490-8ba1-69af4fab8f23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# air temperature for each latitude (axis 0) and month (axis 1),\n",
    "# i.e. the daily latitudinal values averaged over all days of the month\n",
    "wrf_mon_T2 = np.zeros((180, 12))\n",
    "\n",
    "for month_idx in range(12):\n",
    "    # filter indices by month number\n",
    "    monthly_indicies = [\n",
    "        i for i, date in enumerate(wrf_dt_indicies) if date.month == month_idx + 1\n",
    "    ]\n",
    "\n",
    "    # putting values at specific month into averaged array\n",
    "    # (273.15 is subtracted to convert from K to °C)\n",
    "    wrf_mon_T2[:, month_idx] = (\n",
    "        wrf_T2_data_DAYxLAT[monthly_indicies].mean(axis=0) - 273.15\n",
    "    )\n",
    "\n",
    "np.save(\"./data/WRF/WRF_T2_LATxMON.npy\", wrf_mon_T2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b480c05f-4b06-4d33-9527-dbe2655ed251",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27.894258059212177"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wrf_mon_T2.max()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46d4f093-a420-42c7-b885-a8409d9d8ee4",
   "metadata": {},
   "source": [
    "### Preprocessing INMCM and WRF IP: classic parametrization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94a603c3-982d-4c78-be1c-bb6c74b86b5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dictionaries where processed data is saved\n",
    "# the dictionary keys represent the threshold value of CAPE\n",
    "\n",
    "# for storing arrays averaged over hours and summed over longitudes,\n",
    "# i.e. with dimensions (4992, 180) for WRF and (3650, 120) for INMCM\n",
    "wrf_daily_latitudal_ip = {}\n",
    "inmcm_daily_latitudal_ip = {}\n",
    "\n",
    "# for storing arrays summed over longitudes and latitudes,\n",
    "# i.e. with dimensions (4992, 24) for WRF and (3650, 24) for INMCM\n",
    "wrf_hourly_total_ip = {}\n",
    "inmcm_hourly_total_ip = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8e43c4f-59af-483c-8979-535c696abb4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterating over the CAPE threshold (J/kg) values used in modeling\n",
    "# for each threshold, there are corresponding model datasets\n",
    "for cape_thres in [800, 1000, 1200]:\n",
    "\n",
    "    # grid cell contributions to the IP (not normalised) with the shape\n",
    "    # (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "    # modelled using WRF model with CAPE threshold = `cape_thres` J/kg\n",
    "    # contains values of contributions to the IP depending on (d, h, lat, lon)\n",
    "    # d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "    #   every third day is taken\n",
    "    #   d = 0 corresponds to 1 Jan 1980,\n",
    "    #   d = 5113 corresponds to 30 Dec 2021\n",
    "    #   d = 4991 corresponds to 29 Dec 2020\n",
    "    #   (we will restrict our attention to 1980–2020)\n",
    "    # h (axis 1) is the hour of the day (an integer in [0, 24])\n",
    "    #   the values corresponding to h = 0 and h = 24 are the same\n",
    "    # lat (axis 2) describes the latitude (an integer in [0, 179])\n",
    "    # lon (axis 3) describes the longitude (an integer in [0, 359])\n",
    "    wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-{cape_thres}.npy\")[:wrf_N_days]\n",
    "\n",
    "    # discarding the last hour, which duplicates the first one\n",
    "    wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n",
    "\n",
    "    # normalisation of contributions to the IP to the global mean of 240 kV:\n",
    "    # after it the sum over all grid cells, averaged over days and hours,\n",
    "    # equals 240e3\n",
    "    wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "    # filling dictionaries with averaged arrays:\n",
    "    # (days, latitudes) - averaged over hours and summed over longitudes,\n",
    "    # (days, hours) - summed over latitudes and longitudes\n",
    "    wrf_daily_latitudal_ip[cape_thres] = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    wrf_hourly_total_ip[cape_thres] = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "    np.save(f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
    "            wrf_hourly_total_ip[cape_thres])\n",
    "\n",
    "    # grid cell contributions to the IP (not normalised) reshaped to\n",
    "    # (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "    # modelled using INMCM model with CAPE threshold = `cape_thres` J/kg\n",
    "    # contains values of contributions to the IP depending on (d, h, lat, lon)\n",
    "    # d (axis 0) is the number of a day (does not correspond to real days,\n",
    "    #   10 consecutive 365-day years have been simulated)\n",
    "    # h (axis 1) is the hour of the day (an integer in [0, 23])\n",
    "    # lat (axis 2) describes the latitude (an integer in [0, 119])\n",
    "    # lon (axis 3) describes the longitude (an integer in [0, 179])\n",
    "    inmcm_raw_ip_data = np.load(\n",
    "        f\"{src_path}/INMCM-IP-MAP-{cape_thres}.npy\"\n",
    "    ).reshape((inmcm_N_days, 24, 120, 180))\n",
    "\n",
    "    # normalisation of contributions to the IP to the global mean of 240 kV\n",
    "    inmcm_raw_ip_data /= (1/240e3) * inmcm_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "    # filling dictionaries with averaged arrays\n",
    "    inmcm_daily_latitudal_ip[cape_thres] = inmcm_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    inmcm_hourly_total_ip[cape_thres] = inmcm_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "    np.save(f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
    "            inmcm_hourly_total_ip[cape_thres])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb28cbc7-eb0a-49be-8cc1-734bba1d06f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterating over the CAPE threshold (J/kg) values used in modeling\n",
    "# for each threshold, there are corresponding model datasets\n",
    "for cape_thres in [800, 1000, 1200]:\n",
    "\n",
    "    # initialization of arrays to store time-averaged data over months\n",
    "    # with the shape (number of latitudes, number of months)\n",
    "    wrf_data_LATxMON = np.zeros((180, 12))\n",
    "    inmcm_data_LATxMON = np.zeros((120, 12))\n",
    "\n",
    "    # iteration over month number (starting with 0)\n",
    "    for month_idx in range(12):\n",
    "\n",
    "        # filtering day indices belonging to a specific month\n",
    "        # (the two models have different day axes, so each one\n",
    "        # needs its own list of indices)\n",
    "        wrf_monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies)\n",
    "                                if date.month == month_idx + 1]\n",
    "        inm_monthly_indicies = [i for i, date in enumerate(inmcm_dt_indicies)\n",
    "                                if date.month == month_idx + 1]\n",
    "\n",
    "        # filling with modeling values with a CAPE threshold\n",
    "        # averaged over months of the year\n",
    "        wrf_data_LATxMON[:, month_idx] = (\n",
    "            wrf_daily_latitudal_ip[cape_thres][wrf_monthly_indicies].mean(axis=0)\n",
    "        )\n",
    "        inmcm_data_LATxMON[:, month_idx] = (\n",
    "            inmcm_daily_latitudal_ip[cape_thres][inm_monthly_indicies].mean(axis=0)\n",
    "        )\n",
    "\n",
    "    np.save(f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n",
    "            wrf_data_LATxMON)\n",
    "    np.save(f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n",
    "            inmcm_data_LATxMON)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91bc6d7a-393c-4078-9a6d-1955393d55f5",
   "metadata": {},
   "source": [
    "### Preprocessing WRF IP: new parametrization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b6f987e-ba3c-4371-af7b-c9857a7d33d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# grid cell contributions to the IP (not normalised) with the shape\n",
    "# (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "# modelled using WRF model using new parametrization based on\n",
    "# CAPE and T2 with corresponding thresholds 500 J/kg and 25°C.\n",
    "# contains values of contributions to the IP depending on (d, h, lat, lon)\n",
    "# d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "#   every third day is taken\n",
    "#   d = 0 corresponds to 1 Jan 1980,\n",
    "#   d = 5113 corresponds to 30 Dec 2021\n",
    "#   d = 4991 corresponds to 29 Dec 2020\n",
    "#   (we will restrict our attention to 1980–2020)\n",
    "# h (axis 1) is the hour of the day (an integer in [0, 24])\n",
    "#   the values corresponding to h = 0 and h = 24 are the same\n",
    "# lat (axis 2) describes the latitude (an integer in [0, 179])\n",
    "# lon (axis 3) describes the longitude (an integer in [0, 359])\n",
    "wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-500-T2-25.npy\")[:wrf_N_days]\n",
    "\n",
    "# discarding the last hour, which duplicates the first one\n",
    "wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n",
    "\n",
    "# normalisation of contributions to the IP to the global mean of 240 kV\n",
    "wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "# averaged arrays for the new parametrization\n",
    "# note: separate names are used so that the dictionaries\n",
    "# `wrf_daily_latitudal_ip` and `wrf_hourly_total_ip` filled for\n",
    "# the classic parametrization are not overwritten\n",
    "# (days, latitudes) - averaged over hours and summed over longitudes\n",
    "wrf_new_daily_latitudal_ip = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "# (days, hours) - summed over latitudes and longitudes\n",
    "wrf_new_hourly_total_ip = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "np.save(\n",
    "    \"./data/WRF/WRF_HOURLY_TOTAL_IP_500_T2_25.npy\",\n",
    "    wrf_new_hourly_total_ip,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17036c19-95f8-40df-a6c9-f8a23cf426f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialization of an array to store time-averaged data over months\n",
    "# with the shape (number of latitudes, number of months)\n",
    "wrf_data_LATxMON = np.zeros((180, 12))\n",
    "\n",
    "# iteration over month number (starting with 0)\n",
    "for month_idx in range(12):\n",
    "    # filtering day indices belonging to a specific month\n",
    "    monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies)\n",
    "                        if date.month == month_idx + 1]\n",
    "\n",
    "    # filling with modeling values averaged over months of the year\n",
    "    wrf_data_LATxMON[:, month_idx] = (\n",
    "        wrf_new_daily_latitudal_ip[monthly_indicies].mean(axis=0)\n",
    "    )\n",
    "\n",
    "np.save(\"./data/WRF/WRF_IP_500_T2_25_LATxMON.npy\", wrf_data_LATxMON)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e24297fc-cf81-4ea7-9a80-cdcaf277474a",
   "metadata": {},
   "source": [
    "### Saving number of days (used for monthly mean) for each month"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "894ad630-17a5-4744-907e-a07768ff7848",
   "metadata": {},
   "outputs": [],
   "source": [
    "# saving the number of days for each month\n",
    "# necessary for correct averaging due to\n",
    "# different numbers of days in different months\n",
    "\n",
    "wrf_days = np.array(\n",
    "    [sum(date.month == m + 1 for date in wrf_dt_indicies) for m in range(12)]\n",
    ")\n",
    "\n",
    "inm_days = np.array(\n",
    "    [sum(date.month == m + 1 for date in inmcm_dt_indicies) for m in range(12)]\n",
    ")\n",
    "\n",
    "np.save(\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)\n",
    "np.save(\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)\n",
    "\n",
    "# for average over months use\n",
    "# `(wrf_data_LATxMON[:, :].sum(axis=0)*wrf_days).sum()/wrf_days.sum()`\n",
    "# instead of\n",
    "# `wrf_data_LATxMON[:, :].sum(axis=0).mean()`\n",
    "# because\n",
    "# `((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}