{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "98e6e23d-5ca7-4706-b1d9-dd57b54888ef",
   "metadata": {},
   "source": [
    "# Data preprocessing for further calculations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5324ceb9-24e7-454b-87b9-ba9a717078ae",
   "metadata": {},
   "source": [
    "### Importing libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7b2a7f44-b0cb-4471-a0c6-e56da23caf86",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b2b903f-fa30-4e35-97e1-74bc0ee6b944",
   "metadata": {},
   "source": [
    "### Helper variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "36b9f49e-32e6-4544-a9d3-f6a8ba49d867",
   "metadata": {},
   "outputs": [],
   "source": [
    "# also available at https://eee.ipfran.ru/files/seasonal-variation-2024\n",
    "# attention: the files are very large (~ 300 GB in total)\n",
    "src_path = \"/home/shared_files/eee_public_files/seasonal-variation-2024\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e85f1bf1-8777-459f-a624-146d1f0440c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4992\n",
      "14965\n"
     ]
    }
   ],
   "source": [
    "# calculating the number of simulated days used for the WRF analysis\n",
    "# (every third day is taken, starting with 1 Jan 1980;\n",
    "# we limit the data to 1980–2020)\n",
    "\n",
    "wrf_days = pd.date_range(start=\"1980-01-01\", end=\"2021-01-01\", freq=\"3D\")\n",
    "wrf_N_days = len(wrf_days[wrf_days.year <= 2020])  # limiting index for WRF days\n",
    "print(wrf_N_days)\n",
    "\n",
    "# calculating the number of simulated days used for the INMCM analysis\n",
    "# (every day is taken, starting with 1 Jan 1979;\n",
    "# we limit the data to 1980–2020)\n",
    "\n",
    "inm_start_year = 1\n",
    "inm_end_year = 42\n",
    "inm_N_days = 365 * (inm_end_year - inm_start_year)\n",
    "print(inm_N_days)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "53cb9cc3-0e56-4da4-920b-2f071a0846fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dates corresponding to the indices of the data arrays\n",
    "# (in the case of the WRF the dates exactly correspond to real dates,\n",
    "# in the case of the INMCM there are 41 365-day years)\n",
    "\n",
    "wrf_dt_indices = np.array(\n",
    "    [dt.date(1980, 1, 1) + dt.timedelta(i * 3) for i in range(wrf_N_days)]\n",
    ")\n",
    "inm_dt_indices = np.array(\n",
    "    [dt.date(2022, 1, 1) + dt.timedelta(i % 365) for i in range(inm_N_days)]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7",
   "metadata": {},
   "source": [
    "### Preprocessing T2 data from the WRF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ccff6ad9-dc83-440f-8642-93cfc150b03f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `WRF-T2-DAILY.npy` contains WRF-simulated\n",
    "# average air temperature values (in °C) at the height of 2 m with the shape\n",
    "# (number of days, number of latitudes, number of longitudes);\n",
    "# the file contains temperature values depending on (d, lat, lon)\n",
    "#   d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "#              every third day is taken\n",
    "#              d = 0 corresponds to 1 Jan 1980\n",
    "#              d = 5386 corresponds to 28 Mar 2024\n",
    "#              d = 4991 corresponds to 29 Dec 2020\n",
    "#              (we will restrict our attention to 1980–2020)\n",
    "# lat (axis 3) describes the latitude (an integer in [0, 179]\n",
    "#              corresponding to 1° wide cells within 90°S–90°N)\n",
    "# lon (axis 4) describes the longitude (an integer in [0, 359]\n",
    "#              corresponding to 1° wide cells across each circle of latitudes)\n",
    "\n",
    "wrf_T2_data = np.load(f\"{src_path}/WRF-T2-DAILY.npy\")[:wrf_N_days]\n",
    "wrf_T2_data_DAYxLAT = wrf_T2_data.mean(axis=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a7148097-51d8-492b-80c4-7b4174bba4b4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# initialising an array to store monthly averaged values\n",
    "# for different latitudes\n",
    "wrf_T2_LATxMON = np.zeros((180, 12))\n",
    "\n",
    "# iterating over month numbers (starting with 0)\n",
    "for month_idx in range(12):\n",
    "    # filtering indices by the month number\n",
    "    wrf_monthly_indices = [\n",
    "        i for i, date in enumerate(wrf_dt_indices)\n",
    "        if date.month == month_idx + 1\n",
    "    ]\n",
    "\n",
    "    # putting the values for the specific month into the array\n",
    "    wrf_T2_LATxMON[:, month_idx] = (\n",
    "        wrf_T2_data_DAYxLAT[wrf_monthly_indices].mean(axis=0)\n",
    "    )\n",
    "\n",
    "np.save(\"./data/WRF/WRF_T2_LATxMON.npy\", wrf_T2_LATxMON)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9942db7c-6d78-4d5e-94b3-acf483cb562a",
   "metadata": {},
   "outputs": [],
   "source": [
    "del wrf_T2_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "402dc588-a9ac-4b4f-95fb-bd7f58557643",
   "metadata": {},
   "source": [
    "### Preprocessing the values of CAPE and convective precipitation from the INMCM and WRF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "28ad673c-9668-4cfc-8503-daf42c0e2339",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `WRF-CAPE-00-12-TROP.npy` contains WRF-simulated\n",
    "# average max-theta-e CAPE values (in J/kg) in the tropics with the shape\n",
    "# (number of simulations per day, number of days, number of hours,\n",
    "# number of latitudes, number of longitudes);\n",
    "# the file contains CAPE values for the air parcel at the height of the maximum theta-e\n",
    "# depending on (s, d, h, lat, lon)\n",
    "#   s (axis 0) is the number of simulations\n",
    "#              0 corresponds to the start of the model at 0 UTC the day before\n",
    "#              1 corresponds to the start of the model at 12 UTC the day before\n",
    "#   d (axis 1) is the number of a day starting with 0 and ending with 5113\n",
    "#              every third day is taken\n",
    "#              d = 0 corresponds to 1 Jan 1980\n",
    "#              d = 5386 corresponds to 28 Mar 2024\n",
    "#              d = 4991 corresponds to 29 Dec 2020\n",
    "#              (we will restrict our attention to 1980–2020)\n",
    "#   h (axis 2) is the hour since 18 hours after the model initiation\n",
    "#              (an integer in [0, 24])\n",
    "# lat (axis 3) describes the latitude (an integer in [0, 59]\n",
    "#              corresponding to 1° wide cells within 30°S–30°N)\n",
    "# lon (axis 4) describes the longitude (an integer in [0, 359]\n",
    "#              corresponding to 1° wide cells across each circle of latitudes)\n",
    "\n",
    "wrf_cape_data = np.load(\n",
    "    f\"{src_path}/WRF-CAPE-00-12-TROP.npy\"\n",
    ")[:, :wrf_N_days].flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9eed46eb-b88a-4e0b-a1da-0fa365225196",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `INMCM-SCAPE-TROP.npy` contains INMCM-simulated\n",
    "# average surface CAPE values (in J/kg) in the tropics with the shape\n",
    "# (number of years, number of days in a year, number of hours,\n",
    "# number of latitudes, number of longitudes);\n",
    "# the file contains CAPE values for the air parcel at the surface\n",
    "# depending on (y, d, h, lat, lon)\n",
    "#   y (axis 0) is the number of a year starting with 0 and ending with 42\n",
    "#              y = 0 roughly corresponds to 1979\n",
    "#              y = 42 roughly corresponds to 2021\n",
    "#              y values in [1, 41] correspond to 1980–2020\n",
    "#   d (axis 1) is the number of a day (an integer in [0, 364])\n",
    "#              each model year consists of 365 days\n",
    "#   h (axis 2) is the hour of the day (an integer in [0, 23])\n",
    "# lat (axis 3) describes the latitude (an integer in [0, 39]\n",
    "#              corresponding to 1.5° wide cells within 30°S–30°N)\n",
    "# lon (axis 4) describes the longitude (an integer in [0, 179]\n",
    "#              corresponding to 2° wide cells across each circle of latitude)\n",
    "\n",
    "inm_scape_data = np.load(\n",
    "    f\"{src_path}/INMCM-SCAPE-TROP.npy\"\n",
    ")[inm_start_year:inm_end_year].flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5d17f9bd-f86f-4d5b-ac4f-5401d8588ce9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5391360000 0.0 12336.779\n",
      "2585952000 0.0 11888.347\n"
     ]
    }
   ],
   "source": [
    "for a in [wrf_cape_data, inm_scape_data]:\n",
    "    print(len(a), np.amin(a), np.amax(a))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "467a1125-5165-404c-862f-47fec5467922",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculating histogram parameters\n",
    "\n",
    "wrf_cape_hist = np.histogram(wrf_cape_data,\n",
    "                             bins=np.arange(0, 17001, 10))\n",
    "inm_scape_hist = np.histogram(inm_scape_data,\n",
    "                              bins=np.arange(0, 17001, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c38a809c-5b40-40e0-b811-4f27c35e5d58",
   "metadata": {},
   "outputs": [],
   "source": [
    "# saving the data for plotting histograms in other scripts\n",
    "\n",
    "np.savez(\n",
    "    \"./data/WRF/WRF_CAPE_HIST.npz\",\n",
    "    values=wrf_cape_hist[0], bins=wrf_cape_hist[1]\n",
    ")\n",
    "np.savez(\n",
    "    \"./data/INMCM/INMCM_SCAPE_HIST.npz\",\n",
    "    values=inm_scape_hist[0], bins=inm_scape_hist[1]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3388a36e-cd42-4d51-8d66-4e34e24fa015",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `WRF-RAIN-00-12-TROP.npy` contains WRF-simulated\n",
    "# average amounts of convective precipitation in the tropics with the shape\n",
    "# (number of simulations per day, number of days, number of hours,\n",
    "# number of latitudes, number of longitudes);\n",
    "# the file contains hourly values depending on (s, d, h, lat, lon)\n",
    "#   s (axis 0) is the number of simulations\n",
    "#              0 corresponds to the start of the model at 0 UTC the day before\n",
    "#              1 corresponds to the start of the model at 12 UTC the day before\n",
    "#   d (axis 1) is the number of a day starting with 0 and ending with 5113\n",
    "#              every third day is taken\n",
    "#              d = 0 corresponds to 1 Jan 1980\n",
    "#              d = 5386 corresponds to 28 Mar 2024\n",
    "#              d = 4991 corresponds to 29 Dec 2020\n",
    "#              (we will restrict our attention to 1980–2020)\n",
    "#   h (axis 2) is the hour since 18 hours after the model initiation\n",
    "#              (an integer in [0, 24])\n",
    "# lat (axis 3) describes the latitude (an integer in [0, 59]\n",
    "#              corresponding to 1° wide cells within 30°S–30°N)\n",
    "# lon (axis 4) describes the longitude (an integer in [0, 359]\n",
    "#              corresponding to 1° wide cells across each circle of latitudes)\n",
    "\n",
    "wrf_rain_data = np.load(\n",
    "    f\"{src_path}/WRF-RAIN-00-12-TROP.npy\"\n",
    ")[:, :wrf_N_days].flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2c247909-90ce-43d8-9e07-ae0afc34545a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `INMCM-RAIN-TROP.npy` contains INMCM-simulated\n",
    "# average amounts of convective precipitation in the tropics with the shape\n",
    "# (number of years, number of days in a year, number of hours,\n",
    "# number of latitudes, number of longitudes);\n",
    "# the file contains hourly values depending on (y, d, h, lat, lon)\n",
    "#   y (axis 0) is the number of a year starting with 0 and ending with 42\n",
    "#              y = 0 roughly corresponds to 1979\n",
    "#              y = 42 roughly corresponds to 2021\n",
    "#              y values in [1, 41] correspond to 1980–2020\n",
    "#   d (axis 1) is the number of a day (an integer in [0, 364])\n",
    "#              each model year consists of 365 days\n",
    "#   h (axis 2) is the hour of the day (an integer in [0, 23])\n",
    "# lat (axis 3) describes the latitude (an integer in [0, 39]\n",
    "#              corresponding to 1.5° wide cells within 30°S–30°N)\n",
    "# lon (axis 4) describes the longitude (an integer in [0, 179]\n",
    "#              corresponding to 2° wide cells across each circle of latitude)\n",
    "\n",
    "inm_rain_data = np.load(\n",
    "    f\"{src_path}/INMCM-RAIN-TROP.npy\"\n",
    ")[inm_start_year:inm_end_year].flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "74fa8644-6b49-4000-8b10-880378a10b1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# retaining only the cells actually contributing to the IP\n",
    "\n",
    "wrf_cape_rain_data = wrf_cape_data[wrf_rain_data > 0]\n",
    "inm_scape_rain_data = inm_scape_data[inm_rain_data > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "340da2f8-f285-451b-b6be-86823dbc1d1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "899773291 0.0 7764.9863\n",
      "919183611 0.0 11888.347\n"
     ]
    }
   ],
   "source": [
    "for a in [wrf_cape_rain_data, inm_scape_rain_data]:\n",
    "    print(len(a), np.amin(a), np.amax(a))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "02ae9453-3af6-4f0c-9269-d2ffd2b8118c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculating histogram parameters\n",
    "# (only the cells contributing to the IP are taken into account)\n",
    "\n",
    "wrf_cape_hist = np.histogram(wrf_cape_rain_data,\n",
    "                             bins=np.arange(0, 17001, 10))\n",
    "inm_scape_hist = np.histogram(inm_scape_rain_data,\n",
    "                              bins=np.arange(0, 17001, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "95908fbf-4985-4ef5-9066-ba3d094ae4f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# saving the data for plotting histograms in other scripts\n",
    "\n",
    "np.savez(\n",
    "    \"./data/WRF/WRF_CAPE_RAIN_HIST.npz\",\n",
    "    values=wrf_cape_hist[0], bins=wrf_cape_hist[1]\n",
    ")\n",
    "np.savez(\n",
    "    \"./data/INMCM/INMCM_SCAPE_RAIN_HIST.npz\",\n",
    "    values=inm_scape_hist[0], bins=inm_scape_hist[1]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ae9bccd6-b5f4-4f3a-8ccc-6134292fe963",
   "metadata": {},
   "outputs": [],
   "source": [
    "del wrf_cape_data, inm_scape_data\n",
    "del wrf_rain_data, inm_rain_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46d4f093-a420-42c7-b885-a8409d9d8ee4",
   "metadata": {},
   "source": [
    "### Preprocessing IP data from the INMCM and WRF: the usual parameterisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "052374df-a505-420a-aa69-2902ce5fc23b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dictionaries where the processed data are saved\n",
    "# dictionary keys represent CAPE threshold values\n",
    "\n",
    "# dictionaries to store diurnal average IP values summed over longitudes\n",
    "# the dimensions are (4992, 180) for the WRF and (365×41, 120) for the INMCM\n",
    "wrf_daily_lat_ip = {}\n",
    "inm_daily_lat_ip = {}\n",
    "\n",
    "# dictionaries to store hourly IP values summed over longitudes and latitudes\n",
    "# the dimensions are (4992, 24) for the WRF and (365×41, 24) for the INMCM\n",
    "wrf_hourly_total_ip = {}\n",
    "inm_hourly_total_ip = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d8e43c4f-59af-483c-8979-535c696abb4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterating over CAPE threshold values (in J/kg) used in modelling \n",
    "# for each threshold, there are corresponding model data sets\n",
    "\n",
    "for cape_thres in [600, 800, 1000, 1200]:\n",
    "    # `WRF-IP-CAPE-{cape_thres}.npy` contains WRF-simulated\n",
    "    # grid cell contributions to the IP with CAPE threshold = `cape_thres` J/kg\n",
    "    # (not normalised) with the shape\n",
    "    # (number of days, number of hours,\n",
    "    # number of latitudes, number of longitudes);\n",
    "    # the file contains hourly values depending on (d, h, lat, lon)\n",
    "    #   d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "    #              every third day is taken\n",
    "    #              d = 0 corresponds to 1 Jan 1980\n",
    "    #              d = 5386 corresponds to 28 Mar 2024\n",
    "    #              d = 4991 corresponds to 29 Dec 2020\n",
    "    #              (we will restrict our attention to 1980–2020)\n",
    "    #   h (axis 1) is the hour of the day (an integer in [0, 24])\n",
    "    #              the values corresponding to h = 0 and h = 24 are the same\n",
    "    #              (we delete the 25th value)\n",
    "    # lat (axis 2) describes the latitude (an integer in [0, 179]\n",
    "    #              corresponding to 1° wide cells within 90°S–90°N)\n",
    "    # lon (axis 3) describes the longitude (an integer in [0, 359]\n",
    "    #              corresponding to 1° wide cells across each circle of latitudes)\n",
    "\n",
    "    wrf_raw_ip_data = np.load(\n",
    "        f\"{src_path}/WRF-IP-CAPE-{cape_thres}.npy\"\n",
    "    )[:wrf_N_days, :24]\n",
    "\n",
    "    # normalising contributions to the IP to the global mean of 240 kV\n",
    "    wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "    # filling the dictionaries with averaged values\n",
    "    wrf_daily_lat_ip[cape_thres] = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    wrf_hourly_total_ip[cape_thres] = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "    np.save(\n",
    "        f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
    "        wrf_hourly_total_ip[cape_thres]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f8501fdb-7414-4ad7-a5d2-c361bef14297",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterating over CAPE threshold values (in J/kg) used in modelling \n",
    "# for each threshold, there are corresponding model data sets\n",
    "\n",
    "for cape_thres in [600, 800, 1000, 1200]:\n",
    "    # `INMCM-IP-CAPE-{cape_thres}.npy` contains INMCM-simulated\n",
    "    # grid cell contributions to the IP with CAPE threshold = `cape_thres` J/kg\n",
    "    # (not normalised) with the shape\n",
    "    # (number of years, number of days in a year, number of hours,\n",
    "    # number of latitudes, number of longitudes);\n",
    "    # the file contains hourly values depending on (y, d, h, lat, lon)\n",
    "    #   y (axis 0) is the number of a year starting with 0 and ending with 42\n",
    "    #              y = 0 roughly corresponds to 1979\n",
    "    #              y = 42 roughly corresponds to 2021\n",
    "    #              y values in [1, 41] correspond to 1980–2020\n",
    "    #   d (axis 1) is the number of a day (an integer in [0, 364])\n",
    "    #              each model year consists of 365 days\n",
    "    #   h (axis 2) is the hour of the day (an integer in [0, 23])\n",
    "    # lat (axis 3) describes the latitude (an integer in [0, 39]\n",
    "    #              corresponding to 1.5° wide cells within 30°S–30°N)\n",
    "    # lon (axis 4) describes the longitude (an integer in [0, 179]\n",
    "    #              corresponding to 2° wide cells across each circle of latitude)\n",
    "\n",
    "    inm_raw_ip_data = np.load(\n",
    "        f\"{src_path}/INMCM-IP-CAPE-{cape_thres}.npy\"\n",
    "    )[inm_start_year:inm_end_year].reshape(\n",
    "        (inm_N_days, 24, 120, 180)\n",
    "    )\n",
    "\n",
    "    # normalising contributions to the IP to the global mean of 240 kV\n",
    "    inm_raw_ip_data /= (1/240e3) * inm_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "    # filling the dictionaries with averaged values\n",
    "    inm_daily_lat_ip[cape_thres] = inm_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    inm_hourly_total_ip[cape_thres] = inm_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "    np.save(\n",
    "        f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
    "        inm_hourly_total_ip[cape_thres]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "eb28cbc7-eb0a-49be-8cc1-734bba1d06f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterating over CAPE threshold values (in J/kg) used in modelling \n",
    "# for each threshold, there are corresponding model data sets\n",
    "\n",
    "for cape_thres in [600, 800, 1000, 1200]:\n",
    "    # initialising arrays to store monthly averaged values\n",
    "    # for different latitudes\n",
    "    wrf_data_LATxMON = np.zeros((180, 12))\n",
    "    inm_data_LATxMON = np.zeros((120, 12))\n",
    "\n",
    "    # iterating over month numbers (starting with 0)\n",
    "    for month_idx in range(12):\n",
    "        # filtering indices by the month number\n",
    "        wrf_monthly_indices = [i for i, date in enumerate(wrf_dt_indices)\n",
    "                               if date.month == month_idx + 1]\n",
    "        inm_monthly_indices = [i for i, date in enumerate(inm_dt_indices)\n",
    "                               if date.month == month_idx + 1]\n",
    "\n",
    "        # putting the values for the specific month into the array\n",
    "        wrf_data_LATxMON[:, month_idx] = \\\n",
    "            wrf_daily_lat_ip[cape_thres][wrf_monthly_indices].mean(axis=0)\n",
    "        inm_data_LATxMON[:, month_idx] = \\\n",
    "            inm_daily_lat_ip[cape_thres][inm_monthly_indices].mean(axis=0)\n",
    "\n",
    "    np.save(\n",
    "        f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n",
    "        wrf_data_LATxMON\n",
    "    )\n",
    "    np.save(\n",
    "        f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n",
    "        inm_data_LATxMON\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "0bfbc02a-8d81-43a2-b60d-c190c6d78d13",
   "metadata": {},
   "outputs": [],
   "source": [
    "del wrf_raw_ip_data, inm_raw_ip_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91bc6d7a-393c-4078-9a6d-1955393d55f5",
   "metadata": {},
   "source": [
    "### Preprocessing IP data from the WRF: the new parameterisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5bc66a8f-aa8f-4681-91a9-60c9fbdbf8f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `WRF-IP-CAPE-500-T2-LIN-25.npy` contains WRF-simulated\n",
    "# grid cell contributions to the IP with CAPE threshold = 500 J/kg\n",
    "# and temperature threshold = 25 °C (not normalised) with the shape\n",
    "# (number of days, number of hours,\n",
    "# number of latitudes, number of longitudes);\n",
    "# the file contains hourly values depending on (d, h, lat, lon)\n",
    "#   d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
    "#              every third day is taken\n",
    "#              d = 0 corresponds to 1 Jan 1980\n",
    "#              d = 5386 corresponds to 28 Mar 2024\n",
    "#              d = 4991 corresponds to 29 Dec 2020\n",
    "#              (we will restrict our attention to 1980–2020)\n",
    "#   h (axis 1) is the hour of the day (an integer in [0, 24])\n",
    "#              the values corresponding to h = 0 and h = 24 are the same\n",
    "#              (we delete the 25th value)\n",
    "# lat (axis 2) describes the latitude (an integer in [0, 179]\n",
    "#              corresponding to 1° wide cells within 90°S–90°N)\n",
    "# lon (axis 3) describes the longitude (an integer in [0, 359]\n",
    "#              corresponding to 1° wide cells across each circle of latitudes)\n",
    "\n",
    "wrf_raw_ip_data = np.load(\n",
    "    f\"{src_path}/WRF-IP-CAPE-500-T2-LIN-25.npy\"\n",
    ")[:wrf_N_days, :24]\n",
    "\n",
    "# normalising contributions to the IP to the global mean of 240 kV\n",
    "wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
    "# filling the dictionaries with averaged values\n",
    "wrf_daily_latitudal_ip = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "wrf_hourly_total_ip = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
    "np.save(\n",
    "    \"./data/WRF/WRF_HOURLY_TOTAL_IP_500_T2_25.npy\",\n",
    "    wrf_hourly_total_ip,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "17036c19-95f8-40df-a6c9-f8a23cf426f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialising an array to store monthly averaged values\n",
    "# for different latitudes\n",
    "wrf_data_LATxMON = np.zeros((180, 12))\n",
    "\n",
    "# iterating over month numbers (starting with 0)\n",
    "for month_idx in range(12):\n",
    "    # filtering indices by the month number\n",
    "    wrf_monthly_indices = [i for i, date in enumerate(wrf_dt_indices)\n",
    "                           if date.month == month_idx + 1]\n",
    "\n",
    "    # putting the values for the specific month into the array\n",
    "    wrf_data_LATxMON[:, month_idx] = \\\n",
    "        wrf_daily_latitudal_ip[wrf_monthly_indices].mean(axis=0)\n",
    "\n",
    "np.save(\"./data/WRF/WRF_IP_500_T2_25_LATxMON.npy\", wrf_data_LATxMON)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a9f4e443-31bd-4a82-ae8a-b447a9d0f2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "del wrf_raw_ip_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e24297fc-cf81-4ea7-9a80-cdcaf277474a",
   "metadata": {},
   "source": [
    "### Saving the number of days for each month (used to compute mean values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "894ad630-17a5-4744-907e-a07768ff7848",
   "metadata": {},
   "outputs": [],
   "source": [
    "# saving the number of days for each month\n",
    "# necessary for correct averaging due to \n",
    "# different numbers of days in different months\n",
    "\n",
    "wrf_days = np.array([len([i for i, date in enumerate(wrf_dt_indices) \n",
    "                          if date.month == m + 1])\n",
    "                     for m in range(12)])\n",
    "inm_days = np.array([len([i for i, date in enumerate(inm_dt_indices) \n",
    "                          if date.month == m + 1])\n",
    "                     for m in range(12)])\n",
    "\n",
    "np.save(\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)\n",
    "np.save(\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)\n",
    "\n",
    "# to calculate the annual mean value, use\n",
    "# `(wrf_data_LATxMON[:, :].sum(axis=0) * days).sum() / days.sum()`\n",
    "# rather than\n",
    "# `wrf_data_LATxMON[:, :].sum(axis=0).mean()`,\n",
    "# since\n",
    "# `((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04edcb46-b3f9-491a-ba88-509d0fceaac5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}