@@ -5,8 +5,8 @@
"id": "113058f9-8324-4368-99ed-b088987c683a",
"metadata": {},
"source": [
"# New and earlier Vostok datasets \n",
"Seasonal-diurnal diagram, hour crossections, adjustment"
"# Comprehensive analysis of Vostok station data \n",
"Seasonal-diurnal diagram, hour crossections, adjustment using meteoparameters "
]
},
{
@@ -25,12 +25,14 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import datetime as dt\n",
"import matplotlib.transforms as tf\n",
"from matplotlib import cm, colors, colormaps\n",
"import scipy.stats as st"
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"\n",
"import datetime as dt"
]
},
{
@@ -38,7 +40,7 @@
"id": "eb2d634e-ccc5-4fd2-a9e8-05d314f12c1b",
"metadata": {},
"source": [
"### S ource PG datasets"
"### Loading s ource PG datasets"
]
},
{
@@ -48,33 +50,81 @@
"metadata": {},
"outputs": [],
"source": [
"# potential gradient values measured at Vostok station in (1998–2001)\n",
"# units are V·m^(−1)\n",
"# source: Burns et al. (2012), Table 2\n",
"\n",
"vostok_old_data = (\n",
" np.array([195, 201, 205, 192, 188, 195,\n",
" 209, 198, 209, 195, 193, 192])\n",
")\n",
"\n",
"# potential gradient values measured at Vostok station in (1998–2001)\n",
"# units are V·m^(−1)\n",
"# source: Burns et al. (2012), Table 2"
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7de3455e-ba80-420d-8fec-8f82ee28804a ",
"id": "c63b60de-82fa-4dcd-af67-a3d4738a0431 ",
"metadata": {},
"outputs": [],
"source": [
"df_10s = pd.read_csv('./data/vostok_hourly_from_10_s_without_calibration_and_empty.tsv', sep='\\t', parse_dates=['Datetime']).set_index('Datetime')\n",
"df_10s[\"Mark\"] = '10s'\n",
"# Load hourly data derived from 10-second intervals \n",
"# \"Field\" column contains PG values [V/m] without scale and calibration\n",
"df_10s = pd.read_csv('./data/vostok_hourly_from_10_s_without_calibration_and_empty.tsv', \n",
" sep='\\t', parse_dates=['Datetime']).set_index('Datetime')\n",
"\n",
"df_5min = pd.read_csv('./data/vostok_hourly_from_5_min_without_calibration_and_empty.tsv', sep='\\t', parse_dates=['Datetime']).set_index('Datetime')\n",
"df_5min[\"Mark\"] = '5min'\n",
"\n",
"# combine dataframes: fill 10s averaged hours gaps with 10min averaged hours\n",
"df_src = df_10s.combine_first(df_5min)\n",
"# Add a new column 'Mark' to label this dataframe's data as originating from 10-second averages\n",
"df_10s[\"Mark\"] = '10s' "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d2d83b75-97ce-42ba-8a99-8597f84d7946",
"metadata": {},
"outputs": [],
"source": [
"# Load hourly data derived from 5-minute intervals \n",
"# \"Field\" column contains PG values [V/m] without scale and calibration\n",
"df_5min = pd.read_csv('./data/vostok_hourly_from_5_min_without_calibration_and_empty.tsv', \n",
" sep='\\t', parse_dates=['Datetime']).set_index('Datetime')\n",
"\n",
"earlier_df_src = pd.read_csv('./data/vostok_1998_2004_hourly_80percent_all.tsv', sep='\\t', parse_dates=['Datetime']).set_index('Datetime')"
"# Add a new column 'Mark' to label this dataframe's data as originating from 5-minute averages\n",
"df_5min[\"Mark\"] = '5min' "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "26da1848-ad4f-4e58-8a75-d02428fb12c7",
"metadata": {},
"outputs": [],
"source": [
"# Load earlier dataset (1998-2004) \n",
"earlier_df_src = pd.read_csv('./data/vostok_1998_2004_hourly_80percent_all.tsv', \n",
" sep='\\t', parse_dates=['Datetime']).set_index('Datetime')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d126b8b1-0b66-4817-a556-8f33860608ae",
"metadata": {},
"outputs": [],
"source": [
"# Note that all the aforementioned files are averaged with 80% data availability, which \n",
"# means that the hourly value was calculated only when at least 80% of the readings were \n",
"# available (at least 288 10-second averages or at least 9 5-minute averages)."
]
},
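{
"cell_type": "code",
"execution_count": null,
"id": "sketch-availability-check",
"metadata": {},
"outputs": [],
"source": [
"# An illustrative back-of-the-envelope check (a sketch, not part of the source datasets) of\n",
"# where the thresholds above come from: an hour nominally contains 360 ten-second readings\n",
"# and 12 five-minute readings.\n",
"print(0.8 * 3600 / 10)  # 288.0 ten-second averages per hour\n",
"print(0.8 * 60 / 5)     # 9.6 five-minute averages per hour"
]
},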
{
"cell_type": "code",
"execution_count": 7,
"id": "7de3455e-ba80-420d-8fec-8f82ee28804a",
"metadata": {},
"outputs": [],
"source": [
"# Combine the two dataframes: fill in gaps in the 10-second data with the 5-minute data.\n",
"df_src = df_10s.combine_first(df_5min)"
]
},
{
@@ -87,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 4 ,
"execution_count": 8 ,
"id": "42979690-97b5-4e86-8777-db48896539ac",
"metadata": {},
"outputs": [],
@@ -96,9 +146,11 @@
" \"\"\"\n",
" Estimate the standard error from the average value\n",
" and the average value of the square.\n",
" \n",
" :param avg_val: the average value\n",
" :param avg_sqr: the average square value\n",
" :param counter: the size of the sample\n",
" \n",
" :return: the standard error\n",
" \"\"\"\n",
"\n",
@@ -107,40 +159,52 @@
},
{
"cell_type": "code",
"execution_count": 5 ,
"execution_count": 9 ,
"id": "b13f968c-f9db-4b05-86ae-0e997383f9c1",
"metadata": {},
"outputs": [],
"source": [
"def pass_fair_weather(df, scale_factor=None):\n",
" # form factor equal to 3 (remove metal pole influence)\n",
" df[\"Field\"] = df[\"Field\"] / scale_factor\n",
" \"\"\"\n",
" Filter and adjust potential gradient (PG) data to represent fair weather conditions.\n",
"\n",
" :param df: The input dataframe containing 'Field' values for hourly averaged potential gradient (PG) data\n",
" and 'Count' values indicating the number of valid measurements per day.\n",
" :param scale_factor: A factor used to normalize the 'Field' values to mitigate external influences\n",
" such as nearby metal structures (default is None, implying no scaling).\n",
"\n",
" :return: A dataframe with hourly data filtered to likely represent fair weather conditions only.\n",
" \"\"\"\n",
"\n",
" # normalize 'Field' values by the scale_factor if provided.\n",
" if scale_factor:\n",
" df[\"Field\"] = df[\"Field\"] / scale_factor\n",
" \n",
" # remove days with negative or zero hourly PG value\n",
" # exclude records with non-positive PG values \n",
" df = df[df[\"Field\"] > 0]\n",
" \n",
" # remove days with PG > 300 V/m\n",
" # exclude records with PG values > 300 V/m\n",
" df = df[df[\"Field\"] <= 300]\n",
" \n",
" # remove days with incomplete or absent hourly data\n",
" # retain only days with complete hourly data (24 counts) \n",
" df[\"Count\"] = df[\"Field\"].resample('D').transform('count')\n",
" df = df[df[\"Count\"] == 24]\n",
" \n",
" # calculate diurnal field characteristics\n",
" # compute diurnal PG stat s\n",
" df['Emax'] = df['Field'].resample('D').transform('max')\n",
" df['Emin'] = df['Field'].resample('D').transform('min')\n",
" df['Emean'] = df['Field'].resample('D').transform('mean')\n",
" \n",
" # remove days with relative p-p amplitude more than 150% of diurnal mean\n",
" # exclude days with peak-to-peak amplitude over 150% of the daily mean\n",
" df['Var'] = ((df[\"Emax\"]-df[\"Emin\"]))/df[\"Emean\"]\n",
" df = df[df['Var'] <= 1.5]\n",
" \n",
" # debugging code to calculate the number of hours \n",
" # # debug output for '5min' marked data counts per year \n",
" # (averaged from 5-min data) included in individual years \n",
" # z = df[df.Mark == '5min']\n",
" # print(z.groupby(z.index.year).count()[\"Mark\"])\n",
" \n",
" # remove temporary keys from dataframe \n",
" # remove temporary columns used for calculations \n",
" df = df[[\"Field\"]]\n",
"\n",
" return df"
@@ -148,16 +212,30 @@
},
{
"cell_type": "code",
"execution_count": 6 ,
"execution_count": 10 ,
"id": "350d271c-0337-4e2b-89df-f90a42671d5c",
"metadata": {},
"outputs": [],
"source": [
"def calculate_seasonal_var_params(cond_df, key=\"Field\"):\n",
" # monthly mean of fair weather PG\n",
" \"\"\"\n",
" Calculate parameters necessary for assessing the seasonal \n",
" variability of fair weather potential gradients (FW PG).\n",
"\n",
" :param cond_df: DataFrame containing conditioned data with a datetime \n",
" index and at least one numerical column.\n",
" :param key: Column label in 'cond_df' which contains the potential \n",
" gradient data (default is 'Field').\n",
"\n",
" :return: A list containing arrays of:\n",
" monthly mean PG values, \n",
" counts of fair weather days, and \n",
" adjusted monthly mean square PG values.\n",
" \"\"\"\n",
" # compute the monthly mean of fair weather PG\n",
" mon_pg = cond_df[key].groupby(cond_df.index.month).mean().to_numpy().flatten()\n",
"\n",
" # count fair weather days in individual months\n",
" # count the number of fair weather days for each month \n",
" mon_counter = cond_df.groupby(cond_df.index.month).count().to_numpy().flatten()\n",
"\n",
" # sum((daily mean FW PG)**2)/(count FW days) over months\n",
@@ -168,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 7 ,
"execution_count": 11 ,
"id": "89b9f394-8d80-4db6-bd44-0b9778b03f9d",
"metadata": {},
"outputs": [],
@@ -186,16 +264,35 @@
},
{
"cell_type": "code",
"execution_count": 8 ,
"execution_count": 12 ,
"id": "d1f85dec-45b7-47b2-b421-d528776fff39",
"metadata": {},
"outputs": [],
"source": [
"# apply the fair weather filter to the newdataset with a scale factor of 3 to normalize field values.\n",
"df = pass_fair_weather(df_src, scale_factor=3)\n",
"\n",
"# apply the fair weather filter to earlier dataset data with a scale factor of 1 for normalization.\n",
"earlier_df = pass_fair_weather(earlier_df_src, scale_factor=1)\n",
"\n",
"# or for equal mean values of FW histograms from different datasets:\n",
"# earlier_df = pass_fair_weather(earlier_df, scale_factor=1.35) "
"# Optional: Adjust the scale factor to 1.35 for earlier data to equalize the mean values of FW histograms across datasets.\n",
"# earlier_df = pass_fair_weather(earlier_df, scale_factor=1.35)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fbce521b-c249-4441-b962-9a7991fb521f",
"metadata": {},
"outputs": [],
"source": [
"# # debug figure for check the mean values of FW histograms\n",
"# fig, ax = plt.subplots(1,1)\n",
"# df.hist(ax=ax,bins=600, color='red')\n",
"# earlier_df.hist(ax=ax,bins=600, color='blue')\n",
"# plt.xlim(-200,500)\n",
"# plt.xlabel('V/m')\n",
"# plt.ylabel('Counts')"
]
},
{
@@ -208,31 +305,18 @@
},
{
"cell_type": "code",
"execution_count": 9 ,
"execution_count": 14 ,
"id": "c339ed78-e4ba-43a9-89b8-85e8a0c57236",
"metadata": {},
"outputs": [],
"source": [
"# calculate daily mean of fair weather PG\n",
"# resample the filtered data to calculate the daily mean of FW PG\n",
"# then drop any days with missing data to ensure completeness of the dataset\n",
"\n",
"daily_df = df.resample('D').mean().dropna()\n",
"daily_edf = earlier_df.resample('D').mean().dropna()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "fbce521b-c249-4441-b962-9a7991fb521f",
"metadata": {},
"outputs": [],
"source": [
"# fig, ax = plt.subplots(1,1)\n",
"# df.hist(ax=ax,bins=600, color='red')\n",
"# earlier_df.hist(ax=ax,bins=600, color='blue')\n",
"# plt.xlim(-200,500)\n",
"# plt.xlabel('V/m')\n",
"# plt.ylabel('Counts')"
]
},
{
"cell_type": "markdown",
"id": "e5a58975-053a-4162-bccb-f0dbef39b0ed",
@@ -243,13 +327,14 @@
},
{
"cell_type": "code",
"execution_count": 11 ,
"execution_count": 15 ,
"id": "6d6b5a71-6dd3-4471-b081-f3606c7b6956",
"metadata": {},
"outputs": [],
"source": [
"# calculate seasonal var params (avg FW PG, count FW days, \n",
"# sum of squares of FW PG) for specific years\n",
"# calculate seasonal variation parameters for specific years: \n",
"# average FW PG, count of FW days, and the sum of squares of FW PG hourly values.\n",
"\n",
"data, data_counter, data_sqr = np.array(\n",
" [\n",
" calculate_seasonal_var_params(daily_df),\n",
@@ -263,7 +348,7 @@
},
{
"cell_type": "code",
"execution_count": 12 ,
"execution_count": 16 ,
"id": "4e29c652-918d-47e2-ac67-1d0c2a898eea",
"metadata": {},
"outputs": [
@@ -324,7 +409,7 @@
" connectionstyle='arc3,rad=0.', fc='black',\n",
" linewidth=0.5\n",
" ))\n",
" # ampl = (np.max(data[n]) - np.min(data[n])) / np.mean(data[n]) \n",
"\n",
" ampl = (np.max(data[n]) - np.min(data[n])) / \\\n",
" np.sum(data[n] * data_counter[n]) * np.sum(data_counter[n])\n",
" ax[n].text(12.2, (np.min(data[n]) + np.max(data[n])) / 2,\n",
@@ -366,11 +451,14 @@
},
{
"cell_type": "code",
"execution_count": 13 ,
"execution_count": 17 ,
"id": "19eddef4-207d-4fcd-af6b-fc3fcd02e7cd",
"metadata": {},
"outputs": [],
"source": [
"# create a seasonal-diurnal array by grouping the data by hour of the day and month, \n",
"# then calculating the mean FW PG for each combination.\n",
"\n",
"sd_df = df.groupby([df.index.hour, df.index.month]).mean()\n",
"sd_df.index.set_names(['hour', 'month'], inplace=True)\n",
"\n",
@@ -380,7 +468,7 @@
},
{
"cell_type": "code",
"execution_count": 14 ,
"execution_count": 18 ,
"id": "5b03d7c6-e23e-45a9-8188-024d21851176",
"metadata": {},
"outputs": [],
@@ -402,7 +490,7 @@
},
{
"cell_type": "code",
"execution_count": 15 ,
"execution_count": 19 ,
"id": "762c01a9-2ee5-428c-b331-ef44d1801127",
"metadata": {},
"outputs": [
@@ -536,23 +624,18 @@
"pos = (width + wsep1 + widthM) / 2 / width\n",
"lbl.set_position((pos, 1.))\n",
"\n",
"pg_bound = np.arange(90, 191, 10)\n",
"# the bounds of the colour bars (in V/m)\n",
"pg_bound = np.arange(90, 191, 10)\n",
"\n",
"# colour_hour = [colormaps.get_cmap('inferno')(s / 11) for s in range(12)] \n",
"# the colours \n",
"colour_hour = np.array([[163, 163, 255], [71, 71, 255], [1, 14, 236],\n",
" [4, 79, 149], [6, 143, 63], [49, 192, 13],\n",
" [142, 220, 7], [234, 249, 1], [255, 184, 0],\n",
" [255, 92, 0], [255, 0, 0]]) / 255\n",
"# the colours\n",
"\n",
"# pg_cmap = colors.ListedColormap(colour_hour[:-1])\n",
"pg_cmap = colors.ListedColormap(colour_hour[1:])\n",
"pg_norm = colors.BoundaryNorm(pg_bound, pg_cmap.N)\n",
"\n",
"# pg_cmap = 'inferno'\n",
"# pg_norm = colors.Normalize(vmin=pg_bound[0], vmax=pg_bound[-1])\n",
"\n",
"pg_cax = fig.add_axes(\n",
" (\n",
" (wsp - wshift) / tot_width,\n",
@@ -625,7 +708,7 @@
" connectionstyle='arc3,rad=0.', fc='black',\n",
" linewidth=0.5\n",
" ))\n",
" # ampl = (np.max(data[n]) - np.min(data[n])) / np.mean(data[n]) \n",
"\n",
" ampl = (np.max(data[n]) - np.min(data[n])) / \\\n",
" np.sum(data[n] * data_counter[n]) * np.sum(data_counter[n])\n",
" axB[n].text(12.2, (np.min(data[n]) + np.max(data[n])) / 2,\n",
@@ -662,11 +745,15 @@
},
{
"cell_type": "code",
"execution_count": 17 ,
"execution_count": 20 ,
"id": "d16e0930-f81f-4288-a7a9-2f3942194f81",
"metadata": {},
"outputs": [],
"source": [
"# load and filter meteorological data from CSV files into separate dataframes, \n",
"# then join them with the daily FW PG dataframe to create a combined dataframe\n",
"# without missing values\n",
"\n",
"temp_df = pd.read_csv('./data/vostok_daily_temp.csv', parse_dates=['UTC']).set_index('UTC')\n",
"temp_df = temp_df[temp_df[\"COUNT\"] == 4][['T']]\n",
"\n",
@@ -681,11 +768,15 @@
},
{
"cell_type": "code",
"execution_count": 18 ,
"execution_count": 2 1,
"id": "c45bdf6b-d725-4d87-b033-d90fa4f3f9a9",
"metadata": {},
"outputs": [],
"source": [
"# calculate anomalies for potential gradient (pg_anom), temperature\n",
"# (temp_anom), wind speed (wind_anom), and pressure (pres_anom)\n",
"# based on a moving average window of +/-10 days.\n",
"\n",
"pg_anom = np.zeros((len(meteo_df.index)))\n",
"temp_anom = np.zeros((len(meteo_df.index)))\n",
"wind_anom = np.zeros((len(meteo_df.index)))\n",
@@ -714,7 +805,7 @@
},
{
"cell_type": "code",
"execution_count": 19 ,
"execution_count": 22 ,
"id": "326311ea-74ce-4de1-ac7a-e30e173fc8d8",
"metadata": {},
"outputs": [
@@ -750,6 +841,9 @@
}
],
"source": [
"# calculate linear regression coefficients for temperature, wind speed \n",
"# and pressure anomalies against potential gradient anomalies.\n",
"\n",
"temp_coeffs = np.polyfit(temp_anom, pg_anom, deg=1)\n",
"wind_coeffs = np.polyfit(wind_anom, pg_anom, deg=1)\n",
"pres_coeffs = np.polyfit(pres_anom, pg_anom, deg=1)\n",
@@ -763,8 +857,8 @@
" P = len(meteo_df.index) # the number of points\n",
" a = 0.01 # significance level\n",
"\n",
" q = st.t.ppf(1 - a / 2, P - 2)\n",
" r = q / np.sqrt(q**2 + P - 2)\n",
" q = st.t.ppf(1 - a / 2, P - 2) # critical value \n",
" r = q / np.sqrt(q**2 + P - 2) # threshold correlation coefficient \n",
"\n",
" corr = np.corrcoef(anom, pg_anom)[0, 1]\n",
" print(f\"{title}: a={coeffs[0]}\\tb={coeffs[1]}\")\n",
@@ -782,11 +876,15 @@
},
{
"cell_type": "code",
"execution_count": 20 ,
"execution_count": 23 ,
"id": "dafd0f56-b6fa-4883-9640-01fdcb4a3870",
"metadata": {},
"outputs": [],
"source": [
"# update the \"CorrectedField\" column (copy of \"Field\" column) in the `meteo_df`\n",
"# by subtracting the linear regression adjustments \n",
"# based on temperature and wind anomalies.\n",
"\n",
"meteo_df[\"CorrectedField\"] = meteo_df[\"Field\"]\n",
"meteo_df[\"CorrectedField\"] -= temp_coeffs[0] * (meteo_df[\"T\"] - meteo_df[\"T\"].mean())\n",
"meteo_df[\"CorrectedField\"] -= wind_coeffs[0] * (meteo_df[\"Ff\"] - meteo_df[\"Ff\"].mean())"
@@ -802,13 +900,14 @@
},
{
"cell_type": "code",
"execution_count": 21 ,
"execution_count": 24 ,
"id": "9a209c25-af4a-4c67-80cc-2ce657f9cb3d",
"metadata": {},
"outputs": [],
"source": [
"# calculate seasonal var params (avg FW PG, count FW days, \n",
"# sum of squares of FW PG) for specific datasets and fields\n",
"# calculate seasonal variation parameters for specific years: \n",
"# average FW PG, count of FW days, and the sum of squares of FW PG hourly values\n",
"\n",
"data, data_counter, data_sqr = np.array(\n",
" [\n",
" calculate_seasonal_var_params(daily_edf[daily_edf.index.year <= 2001]),\n",
@@ -823,7 +922,7 @@
},
{
"cell_type": "code",
"execution_count": 22 ,
"execution_count": 25 ,
"id": "84050d4a-259f-4513-8d91-0ffc9840f542",
"metadata": {},
"outputs": [
@@ -841,8 +940,7 @@
"source": [
"fig = plt.figure(figsize=(10, 14), constrained_layout=False)\n",
"ax = [None for _ in range(4)]\n",
"# for n in range(4):\n",
"# ax[n] = fig.add_subplot(2, 2, n + 1)\n",
"\n",
"for n in range(4):\n",
" ax[n] = fig.add_subplot(4, 4, (2*n + 1, 2*n + 2))\n",
"\n",
@@ -850,6 +948,7 @@
"high = [220] * 2 + [180] * 2\n",
"step = [20] * 4\n",
"coeff = [1] * 4\n",
"\n",
"caption = ['Vostok station, 1998–2001',\n",
" 'Vostok station, 1998–2001 (adjusted)',\n",
" 'Vostok station, 2006–2020',\n",