diff --git a/0_prepare_data.ipynb b/0_prepare_data.ipynb index 84955e7..c3688d4 100644 --- a/0_prepare_data.ipynb +++ b/0_prepare_data.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "7b2a7f44-b0cb-4471-a0c6-e56da23caf86", "metadata": {}, "outputs": [], @@ -29,9 +29,17 @@ "import pandas as pd" ] }, + { + "cell_type": "markdown", + "id": "6b2b903f-fa30-4e35-97e1-74bc0ee6b944", + "metadata": {}, + "source": [ + "### Helper variables" + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "36b9f49e-32e6-4544-a9d3-f6a8ba49d867", "metadata": {}, "outputs": [], @@ -41,29 +49,29 @@ "src_path = \"../shared_files/eee_public_files/seasonal-variation-2024/\"" ] }, - { - "cell_type": "markdown", - "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7", - "metadata": {}, - "source": [ - "### Preprocessing WRF T2m data" - ] - }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "78a4350c-59fb-479a-b7cd-e2bf9b996d36", "metadata": {}, "outputs": [], "source": [ - "# available numbers of simulated days for analysis\n", + "# used numbers of simulated days for analysis\n", "wrf_N_days = 4992\n", "inmcm_N_days = 3650" ] }, + { + "cell_type": "markdown", + "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7", + "metadata": {}, + "source": [ + "### Preprocessing WRF T2m data" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "53cb9cc3-0e56-4da4-920b-2f071a0846fb", "metadata": {}, "outputs": [], @@ -81,51 +89,48 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": null, "id": "0e4d0268-208f-45c7-bb0a-b5e8d3c7c1f7", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(4992, 180, 360)" - ] - }, - "execution_count": 147, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "wrf_T2_data = np.load(f\"{src_path}/T2-MAP-FULL.npy\")[:wrf_N_days]\n", - "wrf_T2_data.shape" + "# air temperature at the height of 2 m with the shape\n", + "# (number of days, number of latitudes, number of longitudes)\n", + "\n", + "# contains temperature values depending on (d, lat, lon)\n", + "\n", + "# d (axis 0) is the number of a day starting with 0 and ending with 5113\n", + "# every third day is taken\n", + "# d = 0 corresponds to 1 Jan 1980, d = 5113 corresponds to 30 Dec 2021\n", + "# d = 4991 corresponds to 29 Dec 2020\n", + "# (we will restrict our attention to 1980–2020)\n", + "\n", + "# lat (axis 2) describes the latitude (an integer in [0, 179])\n", + "\n", + "# lon (axis 3) describes the longitude (an integer in [0, 359])\n", + "\n", + "wrf_T2_data = np.load(f\"{src_path}/T2-MAP-FULL.npy\")[:wrf_N_days]" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": null, "id": "ec569ffd-93c2-4490-8ba1-69af4fab8f23", "metadata": {}, "outputs": [], "source": [ - "# mean surface air temperature values for different latitudes and months\n", + "# air temperature averaged over latitudes and months\n", "wrf_mon_T2 = np.zeros((180, 12))\n", "\n", "for month_idx in range(12):\n", + " # filter indicies by month number\n", " monthly_indicies = [\n", " i for i, date in enumerate(wrf_dt_indicies) if date.month == month_idx + 1\n", - " ] # indicies of days available for `month_idx+1` month\n", + " ]\n", + "\n", + " # putting values at specific month into averaged array\n", + " wrf_mon_T2[:, month_idx] = wrf_mean_T2[monthly_indicies].mean(axis=0)\n", "\n", - " wrf_mon_T2[:, month_idx] = wrf_mean_T2[monthly_indicies].mean(axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "08302aa4-cb14-47f9-8216-a9db06ae53ef", - "metadata": {}, - "outputs": [], - "source": [ "np.save(f\"./data/WRF/WRF_T2_LATxMON.npy\",wrf_mon_T2)" ] }, @@ -139,102 +144,123 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "94a603c3-982d-4c78-be1c-bb6c74b86b5b", "metadata": {}, "outputs": [], "source": [ + "# dictionaries where processed data is saved\n", + "# the dictionary keys represent the threshold value of CAPE\n", + "\n", + "# for storing arrays with averaged by hours and summarized by longitude,\n", + "# i.e. with dimensions (4992, 180) for WRF and (3650, 120) for INMCM\n", "wrf_daily_latitudal_ip = {}\n", "inmcm_daily_latitudal_ip = {}\n", "\n", + "# for storing arrays summarized by longitude and latitude,\n", + "# i.e. with dimensions (4992, 24) for WRF and (3650, 24) for INMCM\n", "wrf_hourly_total_ip = {}\n", "inmcm_hourly_total_ip = {}" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": null, "id": "d8e43c4f-59af-483c-8979-535c696abb4e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "800\n", - "1000\n", - "1200\n" - ] - } - ], + "outputs": [], "source": [ - "for cape_thres in [800, 1000, 1200]: # J/kg\n", - " print(cape_thres)\n", + "# iterating over the CAPE threshold (J/kg) values used in modeling \n", + "# for each threshold, there are corresponding model datasets\n", + "for cape_thres in [800, 1000, 1200]:\n", "\n", " # grid cell contributions to the IP (not normalised) with the shape\n", " # (number of days, number of hours, number of latitudes, number of longitudes)\n", " wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-{cape_thres}.npy\")[:wrf_N_days]\n", + " # modelled using WRF model with CAPE threshold = `cape_thres` J/kg\n", + " # contains values of contributions to the IP depending on (d, h, lat, lon)\n", + " # d (axis 0) is the number of a day starting with 0 and ending with 5113\n", + " # every third day is taken\n", + " # d = 0 corresponds to 1 Jan 1980, \n", + " # d = 5113 corresponds to 30 Dec 2021\n", + " # d = 4991 corresponds to 29 Dec 2020\n", + " # (we will restrict our attention to 1980–2020)\n", + " # h (axis 1) is the hour of the day (an integer in [0, 24])\n", + " # the values corresponding to h = 0 and h = 24 are the same\n", + " # lat (axis 2) describes the latitude (an integer in [0, 179]) \n", + " # lon (axis 3) describes the longitude (an integer in [0, 359])\n", + "\n", + " # discarding the last hour, which duplicates the first one\n", " wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n", + " \n", + " # normalisation of contributions to the IP to the global mean of 240 kV\n", " wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n", "\n", + " # filling dictionaries with averaged arrays\n", " wrf_daily_latitudal_ip[cape_thres] = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n", " wrf_hourly_total_ip[cape_thres] = wrf_raw_ip_data.sum(axis=(-2, -1))\n", "\n", - " inmcm_raw_ip_data = np.load(f\"{src_path}/INMCM-IP-MAP-{cape_thres}.npy\").reshape((inmcm_N_days, 24, 120, 180))\n", + " np.save(f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n", + " wrf_hourly_total_ip[cape_thres])\n", + "\n", + " # grid cell contributions to the IP (not normalised) reshaped to\n", + " # (number of days, number of hours, number of latitudes, number of longitudes)\n", + " inmcm_raw_ip_data = np.load(f\"{src_path}/INMCM-IP-MAP-{cape_thres}.npy\")\\\n", + " .reshape((inmcm_N_days, 24, 120, 180))\n", + " # modelled using INMCM model with CAPE threshold = `cape_thres` J/kg\n", + " # contains values of contributions to the IP depending on (d, h, lat, lon)\n", + " # d (axis 0) is the number of a day (not correspond to real days,\n", + " # 10 consecutive 365-day years have been simulated)\n", + " # h (axis 1) is the hour of the day (an integer in [0, 23])\n", + " # lat (axis 2) describes the latitude (an integer in [0, 179]) \n", + " # lon (axis 3) describes the longitude (an integer in [0, 359])\n", + "\n", + " # normalisation of contributions to the IP to the global mean of 240 kV\n", " inmcm_raw_ip_data /= (1/240e3) * inmcm_raw_ip_data.sum(axis=(-2,-1)).mean()\n", - " \n", + "\n", + " # filling dictionaries with averaged arrays\n", " inmcm_daily_latitudal_ip[cape_thres] = inmcm_raw_ip_data.mean(axis=1).sum(axis=-1)\n", " inmcm_hourly_total_ip[cape_thres] = inmcm_raw_ip_data.sum(axis=(-2, -1))\n", "\n", - " del wrf_raw_ip_data\n", - " del inmcm_raw_ip_data" + " np.save(f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n", + " inmcm_hourly_total_ip[cape_thres])" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": null, "id": "eb28cbc7-eb0a-49be-8cc1-734bba1d06f5", "metadata": {}, "outputs": [], "source": [ - "for cape_thres in [800, 1000, 1200]: # J/kg\n", - " np.save(\n", - " f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n", - " inmcm_hourly_total_ip[cape_thres],\n", - " )\n", - " np.save(\n", - " f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n", - " wrf_hourly_total_ip[cape_thres],\n", - " )\n", + "# iterating over the CAPE threshold (J/kg) values used in modeling \n", + "# for each threshold, there are corresponding model datasets\n", + "for cape_thres in [800, 1000, 1200]:\n", "\n", + " # initialization of an arrays to store time-averaged data over months\n", " wrf_data_LATxMON = np.zeros((180, 12))\n", " inmcm_data_LATxMON = np.zeros((120, 12))\n", "\n", + " # iteration over month number (starting with 0)\n", " for month_idx in range(12):\n", - " monthly_indicies = [\n", - " i for i, date in enumerate(wrf_dt_indicies) if date.month == month_idx + 1\n", - " ] # indicies of days available for `month_idx+1` month\n", - "\n", - " wrf_data_MONxLAT[:, month_idx] = wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(\n", - " axis=0\n", - " )\n", - " np.save(\n", - " f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n", - " wrf_data_MONxLAT,\n", - " )\n", "\n", - " for month_idx in range(12):\n", - " monthly_indicies = [\n", - " i for i, date in enumerate(inmcm_dt_indicies) if date.month == month_idx + 1\n", - " ] # indicies of days available for `month_idx+1` month\n", - "\n", - " inmcm_data_LATxMON[:, month_idx] = inmcm_daily_latitudal_ip[cape_thres][\n", - " monthly_indicies\n", - " ].mean(axis=0)\n", - " np.save(\n", - " f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n", - " inmcm_data_LATxMON,\n", - " )" + " # filtering day indices belonging to a specific month\n", + " wrf_monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies) \n", + " if date.month == month_idx + 1]\n", + " inm_monthly_indicies = [i for i, date in enumerate(inmcm_dt_indicies) \n", + " if date.month == month_idx + 1]\n", + "\n", + " # filling with modeling values with a CAPE threshold \n", + " # averaged over months of the year\n", + " wrf_data_MONxLAT[:, month_idx] = \\\n", + " wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n", + " inmcm_data_LATxMON[:, month_idx] = \\\n", + " inmcm_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n", + "\n", + " np.save(f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n", + " wrf_data_MONxLAT)\n", + " np.save(f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n", + " inmcm_data_LATxMON)" ] }, { @@ -252,10 +278,30 @@ "metadata": {}, "outputs": [], "source": [ + "# grid cell contributions to the IP (not normalised) reshaped to\n", + "# (number of days, number of hours, number of latitudes, number of longitudes)\n", "wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-500-T2-25.npy\")[:wrf_N_days]\n", + "# modelled using WRF model using new parametrization based on\n", + "# CAPE and T2 with corresponding thresholds 500 J/kg and 25°C.\n", + "# contains values of contributions to the IP depending on (d, h, lat, lon)\n", + "# d (axis 0) is the number of a day starting with 0 and ending with 5113\n", + "# every third day is taken\n", + "# d = 0 corresponds to 1 Jan 1980, \n", + "# d = 5113 corresponds to 30 Dec 2021\n", + "# d = 4991 corresponds to 29 Dec 2020\n", + "# (we will restrict our attention to 1980–2020)\n", + "# h (axis 1) is the hour of the day (an integer in [0, 24])\n", + "# the values corresponding to h = 0 and h = 24 are the same\n", + "# lat (axis 2) describes the latitude (an integer in [0, 179]) \n", + "# lon (axis 3) describes the longitude (an integer in [0, 359])\n", + "\n", + "# discarding the last hour, which duplicates the first one\n", "wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n", + "\n", + "# normalisation of contributions to the IP to the global mean of 240 kV\n", "wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n", "\n", + "# filling dictionaries with averaged arrays\n", "wrf_daily_latitudal_ip = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n", "wrf_hourly_total_ip = wrf_raw_ip_data.sum(axis=(-2, -1))\n", "\n", @@ -267,27 +313,53 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, + "id": "5797fffa-a795-4241-a574-6c95f0195a5d", + "metadata": {}, + "outputs": [], + "source": [ + "# iterating over the CAPE threshold (J/kg) values used in modeling \n", + "# for each threshold, there are corresponding model datasets\n", + "for cape_thres in [800, 1000, 1200]:\n", + "\n", + " # initialization of an arrays to store time-averaged data over months\n", + " wrf_data_LATxMON = np.zeros((180, 12))\n", + " inmcm_data_LATxMON = np.zeros((120, 12))\n", + "\n", + " # iteration over month number (starting with 0)\n", + " for month_idx in range(12):\n", + "\n", + " # filtering day indices belonging to a specific month\n", + " wrf_monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies) \n", + " if date.month == month_idx + 1]\n", + " inm_monthly_indicies = [i for i, date in enumerate(inmcm_dt_indicies) \n", + " if date.month == month_idx + 1]\n", + "\n", + " # filling with modeling values with a CAPE threshold \n", + " # averaged over months of the year\n", + " wrf_data_MONxLAT[:, month_idx] = \\\n", + " wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n", + " inmcm_data_LATxMON[:, month_idx] = \\\n", + " inmcm_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "17036c19-95f8-40df-a6c9-f8a23cf426f6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4992\n" - ] - } - ], + "outputs": [], "source": [ + "# initialization of a array to store time-averaged data over months\n", "wrf_data_LATxMON = np.zeros((180, 12))\n", "\n", + "# iteration over month number (starting with 0)\n", "for month_idx in range(12):\n", - " monthly_indicies = [\n", - " i for i, date in enumerate(wrf_dt_indicies) \n", - " if date.month == month_idx + 1\n", - " ]\n", + " # filtering day indices belonging to a specific month\n", + " monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies)\n", + " if date.month == month_idx + 1]\n", "\n", + " # filling with modeling values averaged over months of the year\n", " wrf_data_LATxMON[:, month_idx] = \\\n", " wrf_daily_latitudal_ip[monthly_indicies].mean(axis=0)\n", "\n", @@ -304,47 +376,32 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "6dfdae16-cb07-4580-97ba-414b5b2c1f2b", + "execution_count": null, + "id": "894ad630-17a5-4744-907e-a07768ff7848", "metadata": {}, "outputs": [], "source": [ - "wrf_days = np.array([len([\n", - " i for i, date in enumerate(wrf_dt_indicies) \n", - " if date.month == m + 1\n", - " ]) for m in range(12)])\n", + "# saving the number of days for each month\n", + "# necessary for correct averaging due to \n", + "# different numbers of days in different months\n", "\n", - "np.save(f\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "762dd08f-7d1e-4b68-b272-e7aa9b26d9f1", - "metadata": {}, - "outputs": [], - "source": [ - "inm_days = np.array([len([\n", - " i for i, date in enumerate(inmcm_dt_indicies) \n", - " if date.month == m + 1\n", - " ]) for m in range(12)])\n", + "wrf_days = np.array([len([i for i, date in enumerate(wrf_dt_indicies) \n", + " if date.month == m + 1]) \n", + " for m in range(12)])\n", + "\n", + "inm_days = np.array([len([i for i, date in enumerate(inmcm_dt_indicies) \n", + " if date.month == m + 1]) \n", + " for m in range(12)])\n", + "\n", + "np.save(f\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)\n", + "np.save(f\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)\n", "\n", - "np.save(f\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "78afb8e1-3975-4e19-993f-1b0c74c3e9bd", - "metadata": {}, - "outputs": [], - "source": [ "# for average over months use\n", "# `(wrf_data_LATxMON[:, :].sum(axis=0)*days).sum()/days.sum()`\n", "# unstead\n", "# `wrf_data_LATxMON[:, :].sum(axis=0).mean()`\n", "# because\n", - "# ((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5" + "# `((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5`" ] } ],