Completed data preprocessing script commenting

1 year ago · 671cea448d
1 changed files with 195 additions and 138 deletions
--- a/0_prepare_data.ipynb
+++ b/0_prepare_data.ipynb
@ -18,7 +18,7 @@
				@@ -18,7 +18,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "7b2a7f44-b0cb-4471-a0c6-e56da23caf86",
   "metadata": {},
   "outputs": [],
@ -29,9 +29,17 @@
				@@ -29,9 +29,17 @@
    "import pandas as pd"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "6b2b903f-fa30-4e35-97e1-74bc0ee6b944",
+   "metadata": {},
+   "source": [
+    "### Helper variables"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "id": "36b9f49e-32e6-4544-a9d3-f6a8ba49d867",
   "metadata": {},
   "outputs": [],
@ -41,29 +49,29 @@
				@@ -41,29 +49,29 @@
    "src_path = \"../shared_files/eee_public_files/seasonal-variation-2024/\""
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7",
-   "metadata": {},
-   "source": [
-    "### Preprocessing WRF T2m data"
-   ]
-  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "id": "78a4350c-59fb-479a-b7cd-e2bf9b996d36",
   "metadata": {},
   "outputs": [],
   "source": [
-    "# available numbers of simulated days for analysis\n",
+    "# used numbers of simulated days for analysis\n",
    "wrf_N_days = 4992\n",
    "inmcm_N_days = 3650"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "5e16ee8e-f3b0-4251-9691-19d7dfd4aff7",
+   "metadata": {},
+   "source": [
+    "### Preprocessing WRF T2m data"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "53cb9cc3-0e56-4da4-920b-2f071a0846fb",
   "metadata": {},
   "outputs": [],
@ -81,51 +89,48 @@
				@@ -81,51 +89,48 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": null,
   "id": "0e4d0268-208f-45c7-bb0a-b5e8d3c7c1f7",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(4992, 180, 360)"
-      ]
-     },
-     "execution_count": 147,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "wrf_T2_data = np.load(f\"{src_path}/T2-MAP-FULL.npy\")[:wrf_N_days]\n",
-    "wrf_T2_data.shape"
+    "# air temperature at the height of 2 m with the shape\n",
+    "# (number of days, number of latitudes, number of longitudes)\n",
+    "\n",
+    "# contains temperature values depending on (d, lat, lon)\n",
+    "\n",
+    "# d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
+    "# every third day is taken\n",
+    "# d = 0 corresponds to 1 Jan 1980, d = 5113 corresponds to 30 Dec 2021\n",
+    "# d = 4991 corresponds to 29 Dec 2020\n",
+    "# (we will restrict our attention to 1980–2020)\n",
+    "\n",
+    "# lat (axis 2) describes the latitude (an integer in [0, 179])\n",
+    "\n",
+    "# lon (axis 3) describes the longitude (an integer in [0, 359])\n",
+    "\n",
+    "wrf_T2_data = np.load(f\"{src_path}/T2-MAP-FULL.npy\")[:wrf_N_days]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": null,
   "id": "ec569ffd-93c2-4490-8ba1-69af4fab8f23",
   "metadata": {},
   "outputs": [],
   "source": [
-    "# mean surface air temperature values for different latitudes and months\n",
+    "# air temperature averaged over latitudes and months\n",
    "wrf_mon_T2 = np.zeros((180, 12))\n",
    "\n",
    "for month_idx in range(12):\n",
+    "    # filter indicies by month number\n",
    "    monthly_indicies = [\n",
    "        i for i, date in enumerate(wrf_dt_indicies) if date.month == month_idx + 1\n",
-    "    ]  # indicies of days available for `month_idx+1` month\n",
+    "    ]\n",
+    "\n",
+    "    # putting values at specific month into averaged array\n",
+    "    wrf_mon_T2[:, month_idx] = wrf_mean_T2[monthly_indicies].mean(axis=0)\n",
    "\n",
-    "    wrf_mon_T2[:, month_idx] = wrf_mean_T2[monthly_indicies].mean(axis=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 164,
-   "id": "08302aa4-cb14-47f9-8216-a9db06ae53ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
    "np.save(f\"./data/WRF/WRF_T2_LATxMON.npy\",wrf_mon_T2)"
   ]
  },
@ -139,102 +144,123 @@
				@@ -139,102 +144,123 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": null,
   "id": "94a603c3-982d-4c78-be1c-bb6c74b86b5b",
   "metadata": {},
   "outputs": [],
   "source": [
+    "# dictionaries where processed data is saved\n",
+    "# the dictionary keys represent the threshold value of CAPE\n",
+    "\n",
+    "# for storing arrays with averaged by hours and summarized by longitude,\n",
+    "# i.e. with dimensions (4992, 180) for WRF and (3650, 120) for INMCM\n",
    "wrf_daily_latitudal_ip = {}\n",
    "inmcm_daily_latitudal_ip = {}\n",
    "\n",
+    "# for storing arrays summarized by longitude and latitude,\n",
+    "# i.e. with dimensions (4992, 24) for WRF and (3650, 24) for INMCM\n",
    "wrf_hourly_total_ip = {}\n",
    "inmcm_hourly_total_ip = {}"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": null,
   "id": "d8e43c4f-59af-483c-8979-535c696abb4e",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "800\n",
-      "1000\n",
-      "1200\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "for cape_thres in [800, 1000, 1200]: # J/kg\n",
-    "    print(cape_thres)\n",
+    "# iterating over the CAPE threshold (J/kg) values used in modeling \n",
+    "# for each threshold, there are corresponding model datasets\n",
+    "for cape_thres in [800, 1000, 1200]:\n",
    "\n",
    "    # grid cell contributions to the IP (not normalised) with the shape\n",
    "    # (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "    wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-{cape_thres}.npy\")[:wrf_N_days]\n",
+    "    # modelled using WRF model with CAPE threshold = `cape_thres` J/kg\n",
+    "    # contains values of contributions to the IP depending on (d, h, lat, lon)\n",
+    "    #   d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
+    "    #              every third day is taken\n",
+    "    #              d = 0 corresponds to 1 Jan 1980, \n",
+    "    #              d = 5113 corresponds to 30 Dec 2021\n",
+    "    #              d = 4991 corresponds to 29 Dec 2020\n",
+    "    #              (we will restrict our attention to 1980–2020)\n",
+    "    #   h (axis 1) is the hour of the day (an integer in [0, 24])\n",
+    "    #              the values corresponding to h = 0 and h = 24 are the same\n",
+    "    # lat (axis 2) describes the latitude (an integer in [0, 179]) \n",
+    "    # lon (axis 3) describes the longitude (an integer in [0, 359])\n",
+    "\n",
+    "    # discarding the last hour, which duplicates the first one\n",
    "    wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n",
+    "    \n",
+    "    # normalisation of contributions to the IP to the global mean of 240 kV\n",
    "    wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
+    "    # filling dictionaries with averaged arrays\n",
    "    wrf_daily_latitudal_ip[cape_thres] = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    wrf_hourly_total_ip[cape_thres] = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
-    "    inmcm_raw_ip_data = np.load(f\"{src_path}/INMCM-IP-MAP-{cape_thres}.npy\").reshape((inmcm_N_days, 24, 120, 180))\n",
+    "    np.save(f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
+    "            wrf_hourly_total_ip[cape_thres])\n",
+    "\n",
+    "    # grid cell contributions to the IP (not normalised) reshaped to\n",
+    "    # (number of days, number of hours, number of latitudes, number of longitudes)\n",
+    "    inmcm_raw_ip_data = np.load(f\"{src_path}/INMCM-IP-MAP-{cape_thres}.npy\")\\\n",
+    "                          .reshape((inmcm_N_days, 24, 120, 180))\n",
+    "    # modelled using INMCM model with CAPE threshold = `cape_thres` J/kg\n",
+    "    # contains values of contributions to the IP depending on (d, h, lat, lon)\n",
+    "    #   d (axis 0) is the number of a day (not correspond to real days,\n",
+    "    #              10 consecutive 365-day years have been simulated)\n",
+    "    #   h (axis 1) is the hour of the day (an integer in [0, 23])\n",
+    "    # lat (axis 2) describes the latitude (an integer in [0, 179]) \n",
+    "    # lon (axis 3) describes the longitude (an integer in [0, 359])\n",
+    "\n",
+    "    # normalisation of contributions to the IP to the global mean of 240 kV\n",
    "    inmcm_raw_ip_data /= (1/240e3) * inmcm_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
-    "    \n",
+    "\n",
+    "    # filling dictionaries with averaged arrays\n",
    "    inmcm_daily_latitudal_ip[cape_thres] = inmcm_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "    inmcm_hourly_total_ip[cape_thres] = inmcm_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
-    "    del wrf_raw_ip_data\n",
-    "    del inmcm_raw_ip_data"
+    "    np.save(f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
+    "            inmcm_hourly_total_ip[cape_thres])"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": null,
   "id": "eb28cbc7-eb0a-49be-8cc1-734bba1d06f5",
   "metadata": {},
   "outputs": [],
   "source": [
-    "for cape_thres in [800, 1000, 1200]:  # J/kg\n",
-    "    np.save(\n",
-    "        f\"./data/INMCM/INMCM_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
-    "        inmcm_hourly_total_ip[cape_thres],\n",
-    "    )\n",
-    "    np.save(\n",
-    "        f\"./data/WRF/WRF_HOURLY_TOTAL_IP_{cape_thres}.npy\",\n",
-    "        wrf_hourly_total_ip[cape_thres],\n",
-    "    )\n",
+    "# iterating over the CAPE threshold (J/kg) values used in modeling \n",
+    "# for each threshold, there are corresponding model datasets\n",
+    "for cape_thres in [800, 1000, 1200]:\n",
    "\n",
+    "    # initialization of an arrays to store time-averaged data over months\n",
    "    wrf_data_LATxMON = np.zeros((180, 12))\n",
    "    inmcm_data_LATxMON = np.zeros((120, 12))\n",
    "\n",
+    "    # iteration over month number (starting with 0)\n",
    "    for month_idx in range(12):\n",
-    "        monthly_indicies = [\n",
-    "            i for i, date in enumerate(wrf_dt_indicies) if date.month == month_idx + 1\n",
-    "        ]  # indicies of days available for `month_idx+1` month\n",
-    "\n",
-    "        wrf_data_MONxLAT[:, month_idx] = wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(\n",
-    "            axis=0\n",
-    "        )\n",
-    "    np.save(\n",
-    "        f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n",
-    "        wrf_data_MONxLAT,\n",
-    "    )\n",
    "\n",
-    "    for month_idx in range(12):\n",
-    "        monthly_indicies = [\n",
-    "            i for i, date in enumerate(inmcm_dt_indicies) if date.month == month_idx + 1\n",
-    "        ]  # indicies of days available for `month_idx+1` month\n",
-    "\n",
-    "        inmcm_data_LATxMON[:, month_idx] = inmcm_daily_latitudal_ip[cape_thres][\n",
-    "            monthly_indicies\n",
-    "        ].mean(axis=0)\n",
-    "    np.save(\n",
-    "        f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n",
-    "        inmcm_data_LATxMON,\n",
-    "    )"
+    "        # filtering day indices belonging to a specific month\n",
+    "        wrf_monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies) \n",
+    "                                if date.month == month_idx + 1]\n",
+    "        inm_monthly_indicies = [i for i, date in enumerate(inmcm_dt_indicies) \n",
+    "                                if date.month == month_idx + 1]\n",
+    "\n",
+    "        # filling with modeling values with a CAPE threshold \n",
+    "        # averaged over months of the year\n",
+    "        wrf_data_MONxLAT[:, month_idx] = \\\n",
+    "            wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n",
+    "        inmcm_data_LATxMON[:, month_idx] = \\\n",
+    "            inmcm_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n",
+    "\n",
+    "    np.save(f\"./data/WRF/WRF_IP_{cape_thres}_LATxMON.npy\",\n",
+    "            wrf_data_MONxLAT)\n",
+    "    np.save(f\"./data/INMCM/INMCM_IP_{cape_thres}_LATxMON.npy\",\n",
+    "            inmcm_data_LATxMON)"
   ]
  },
  {
@ -252,10 +278,30 @@
				@@ -252,10 +278,30 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# grid cell contributions to the IP (not normalised) reshaped to\n",
+    "# (number of days, number of hours, number of latitudes, number of longitudes)\n",
    "wrf_raw_ip_data = np.load(f\"{src_path}/WRF-IP-MAP-500-T2-25.npy\")[:wrf_N_days]\n",
+    "# modelled using WRF model using new parametrization based on\n",
+    "# CAPE and T2 with corresponding thresholds 500 J/kg and 25°C.\n",
+    "# contains values of contributions to the IP depending on (d, h, lat, lon)\n",
+    "#   d (axis 0) is the number of a day starting with 0 and ending with 5113\n",
+    "#              every third day is taken\n",
+    "#              d = 0 corresponds to 1 Jan 1980, \n",
+    "#              d = 5113 corresponds to 30 Dec 2021\n",
+    "#              d = 4991 corresponds to 29 Dec 2020\n",
+    "#              (we will restrict our attention to 1980–2020)\n",
+    "#   h (axis 1) is the hour of the day (an integer in [0, 24])\n",
+    "#              the values corresponding to h = 0 and h = 24 are the same\n",
+    "# lat (axis 2) describes the latitude (an integer in [0, 179]) \n",
+    "# lon (axis 3) describes the longitude (an integer in [0, 359])\n",
+    "\n",
+    "# discarding the last hour, which duplicates the first one\n",
    "wrf_raw_ip_data = wrf_raw_ip_data[:, :24, :, :]\n",
+    "\n",
+    "# normalisation of contributions to the IP to the global mean of 240 kV\n",
    "wrf_raw_ip_data /= (1/240e3) * wrf_raw_ip_data.sum(axis=(-2,-1)).mean()\n",
    "\n",
+    "# filling dictionaries with averaged arrays\n",
    "wrf_daily_latitudal_ip = wrf_raw_ip_data.mean(axis=1).sum(axis=-1)\n",
    "wrf_hourly_total_ip = wrf_raw_ip_data.sum(axis=(-2, -1))\n",
    "\n",
@ -267,27 +313,53 @@
				@@ -267,27 +313,53 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": null,
+   "id": "5797fffa-a795-4241-a574-6c95f0195a5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# iterating over the CAPE threshold (J/kg) values used in modeling \n",
+    "# for each threshold, there are corresponding model datasets\n",
+    "for cape_thres in [800, 1000, 1200]:\n",
+    "\n",
+    "    # initialization of an arrays to store time-averaged data over months\n",
+    "    wrf_data_LATxMON = np.zeros((180, 12))\n",
+    "    inmcm_data_LATxMON = np.zeros((120, 12))\n",
+    "\n",
+    "    # iteration over month number (starting with 0)\n",
+    "    for month_idx in range(12):\n",
+    "\n",
+    "        # filtering day indices belonging to a specific month\n",
+    "        wrf_monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies) \n",
+    "                                if date.month == month_idx + 1]\n",
+    "        inm_monthly_indicies = [i for i, date in enumerate(inmcm_dt_indicies) \n",
+    "                                if date.month == month_idx + 1]\n",
+    "\n",
+    "        # filling with modeling values with a CAPE threshold \n",
+    "        # averaged over months of the year\n",
+    "        wrf_data_MONxLAT[:, month_idx] = \\\n",
+    "            wrf_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)\n",
+    "        inmcm_data_LATxMON[:, month_idx] = \\\n",
+    "            inmcm_daily_latitudal_ip[cape_thres][monthly_indicies].mean(axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
   "id": "17036c19-95f8-40df-a6c9-f8a23cf426f6",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "4992\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
+    "# initialization of a array to store time-averaged data over months\n",
    "wrf_data_LATxMON = np.zeros((180, 12))\n",
    "\n",
+    "# iteration over month number (starting with 0)\n",
    "for month_idx in range(12):\n",
-    "    monthly_indicies = [\n",
-    "        i for i, date in enumerate(wrf_dt_indicies) \n",
-    "        if date.month == month_idx + 1\n",
-    "    ]\n",
+    "    # filtering day indices belonging to a specific month\n",
+    "    monthly_indicies = [i for i, date in enumerate(wrf_dt_indicies)\n",
+    "                        if date.month == month_idx + 1]\n",
    "\n",
+    "    # filling with modeling values averaged over months of the year\n",
    "    wrf_data_LATxMON[:, month_idx] = \\\n",
    "        wrf_daily_latitudal_ip[monthly_indicies].mean(axis=0)\n",
    "\n",
@ -304,47 +376,32 @@
				@@ -304,47 +376,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 48,
-   "id": "6dfdae16-cb07-4580-97ba-414b5b2c1f2b",
+   "execution_count": null,
+   "id": "894ad630-17a5-4744-907e-a07768ff7848",
   "metadata": {},
   "outputs": [],
   "source": [
-    "wrf_days = np.array([len([\n",
-    "        i for i, date in enumerate(wrf_dt_indicies) \n",
-    "        if date.month == m + 1\n",
-    "    ]) for m in range(12)])\n",
+    "# saving the number of days for each month\n",
+    "# necessary for correct averaging due to \n",
+    "# different numbers of days in different months\n",
    "\n",
-    "np.save(f\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 53,
-   "id": "762dd08f-7d1e-4b68-b272-e7aa9b26d9f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "inm_days = np.array([len([\n",
-    "        i for i, date in enumerate(inmcm_dt_indicies) \n",
-    "        if date.month == m + 1\n",
-    "    ]) for m in range(12)])\n",
+    "wrf_days = np.array([len([i for i, date in enumerate(wrf_dt_indicies) \n",
+    "                          if date.month == m + 1]) \n",
+    "                     for m in range(12)])\n",
+    "\n",
+    "inm_days = np.array([len([i for i, date in enumerate(inmcm_dt_indicies) \n",
+    "                          if date.month == m + 1]) \n",
+    "                     for m in range(12)])\n",
+    "\n",
+    "np.save(f\"./data/WRF/WRF_NUMDAYS_MON.npy\", wrf_days)\n",
+    "np.save(f\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)\n",
    "\n",
-    "np.save(f\"./data/INMCM/INMCM_NUMDAYS_MON.npy\", inm_days)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
-   "id": "78afb8e1-3975-4e19-993f-1b0c74c3e9bd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
    "# for average over months use\n",
    "# `(wrf_data_LATxMON[:, :].sum(axis=0)*days).sum()/days.sum()`\n",
    "# unstead\n",
    "# `wrf_data_LATxMON[:, :].sum(axis=0).mean()`\n",
    "# because\n",
-    "# ((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5"
+    "# `((a1+a2+a3)/3 + (b1+b2)/2)/2 != (a1+a2+a3+b1+b2)/5`"
   ]
  }
 ],