#!/usr/bin/env python3
"""Evaluate a binary lightning-flash classifier on ``traindata_sea.csv``.

The script:

1. Loads the training table and derives a binary target ``flash`` from the
   ``lightning_count`` column.
2. Drops the target, location, time and lightning columns so that only the
   remaining feature columns are used as model inputs.
3. Repeats ``folds`` random train/test splits; on each one it standardizes
   the features, fits ``clf`` and accumulates precision, recall, F1 and the
   raveled confusion matrix.
4. Prints the fold-averaged metrics, the summed confusion matrix and, when
   the classifier exposes ``coef_``, the averaged coefficients normalized by
   their largest absolute value.
"""
import datetime
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Row count the summed confusion matrix is compared against at the end.
EXPECTED_EVENTS = 105295

# load train data
data = pd.read_csv('traindata_sea.csv')

# add feature column
# NOTE(review): np.heaviside(x, 0) returns 0 at x == 0, so a row with
# lightning_count == 1 is labelled 0; only counts above 1 become 1.
# Confirm that this threshold is intended.
data['flash'] = np.heaviside(data['lightning_count'] - 1, 0)

# select target column
wwlln = data['flash']

# drop everything but the satellite data
data = data.drop(columns=[
    'flash',
    'lat',
    'lon',
    'ptime',
    'lightning_count',
    'avg_energy',
])
print(data.head())

folds = 5  # how many folds
scaler = StandardScaler()

# select classifier
clf = LogisticRegression(penalty='l2', class_weight={1: 0.774})
#clf = RandomForestClassifier(n_estimators=50)
#clf = MLPClassifier(hidden_layer_sizes=(5,), shuffle=True,verbose=False)

# NOTE(review): `accur` accumulates precision_score, not accuracy_score.
accur, recall, f1 = 0, 0, 0
matrix = np.zeros((4))
start_time = datetime.datetime.now()

# Accumulator for the per-fold coefficients. It must start at zero:
# np.empty would leave arbitrary memory contents in the running sum.
coefs = np.zeros([1, data.shape[1]])
has_coefs = False  # set once the classifier has provided coef_

# NOTE(review): every iteration draws an independent random split, so the
# test sets of different iterations may overlap; this is repeated hold-out
# rather than a k-fold partition of the data.
for fold in range(folds):
    print('N_fold is:', fold)
    X_train, X_test, y_train, y_test = train_test_split(
        data, wwlln, test_size=1/folds, shuffle=True)

    # Fit the scaler on the training part only, then reuse it for the test part.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf.fit(X_train_scaled, y_train)

    # Not every estimator exposes coef_ (see the commented-out alternatives
    # above); only that specific failure is tolerated.
    try:
        fold_coefs = clf.coef_
    except AttributeError:
        print('There are NO coeffs for this clf!')
    else:
        coefs += fold_coefs
        has_coefs = True

    predict = clf.predict(X_test_scaled)
    accur += precision_score(y_test, predict)
    recall += recall_score(y_test, predict)
    f1 += f1_score(y_test, predict)
    matrix += confusion_matrix(y_test, predict).ravel()
    print (f1)  # running (cumulative) F1 sum after this fold

time = datetime.datetime.now() - start_time

# Averaged coefficients scaled so that the largest magnitude is 1. Skipped
# when the classifier never provided coefficients (the table would be 0/0).
if has_coefs:
    mean_coefs = coefs / folds
    print (pd.DataFrame(mean_coefs / np.max(np.abs(mean_coefs))))

print ('accur', accur/folds)
print ('recall', recall/folds)
print ('f1', f1/folds)
print ('TN', 'FP', 'FN', 'TP')
print (matrix)
print('Total events:', matrix.sum())
print('By set:', EXPECTED_EVENTS)
if matrix.sum() == EXPECTED_EVENTS:
    print('Match! OK!')
print ('time', time)