Browse Source

add dataset and script

master
nilyin 3 years ago
commit
a2ab63af88
  1. 88
      analyzer.py
  2. 105296
      traindata_sea.csv

88
analyzer.py

@ -0,0 +1,88 @@ @@ -0,0 +1,88 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import datetime
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# Train a binary "lightning flash" classifier on the columns that remain after
# dropping the position, time and lightning columns from traindata_sea.csv,
# and report metrics averaged over several random train/test splits.

# Load the training data.
data = pd.read_csv('traindata_sea.csv')
# Remember the row count before any column manipulation; it is used for the
# sanity check at the end instead of a hard-coded dataset size.
total_rows = len(data)

# Add the binary target column.
# NOTE(review): np.heaviside(x, 0) returns 0 for x == 0, so a row with
# lightning_count == 1 is labelled 0; only counts greater than 1 become 1.
# Confirm this is intended and not an off-by-one.
data['flash'] = np.heaviside(data['lightning_count'] - 1, 0)
# Select the target column.
wwlln = data['flash']
# Drop the target and every non-feature column so that only the predictor
# columns (presumably satellite data) remain.
data = data.drop(columns=['flash', 'lat', 'lon', 'ptime',
                          'lightning_count', 'avg_energy'])
print(data.head())

folds = 5  # number of random train/test repetitions
scaler = StandardScaler()
# Select the classifier. Alternatives that were tried:
#   RandomForestClassifier(n_estimators=50)
#   MLPClassifier(hidden_layer_sizes=(5,), shuffle=True, verbose=False)
clf = LogisticRegression(penalty='l2', class_weight={1: 0.774})

# Running sums over all repetitions; divided by `folds` when reported.
# NOTE(review): `accur` accumulates precision_score, not accuracy_score, yet
# it is printed under the label 'accur' -- confirm which metric is wanted.
accur, recall, f1 = 0, 0, 0
matrix = np.zeros(4)
start_time = datetime.datetime.now()
# Must start from zeros: this array is an accumulator, and np.empty would
# leave arbitrary memory contents in the averaged coefficients.
coefs = np.zeros([1, data.shape[1]])
has_coefs = False

# NOTE: each iteration draws an independent random split, so the test sets of
# different iterations may overlap (this is not a partitioning k-fold).
for fold in range(folds):
    print('N_fold is:', fold)
    X_train, X_test, y_train, y_test = train_test_split(
        data, wwlln, test_size=1/folds, shuffle=True)
    # Fit the scaler on the training part only, then reuse it for the test part.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf.fit(X_train_scaled, y_train)
    try:
        coefs += clf.coef_
    except (AttributeError, ValueError):
        # The classifier exposes no `coef_`, or its shape does not match one
        # row of per-feature coefficients.
        print('There are NO coeffs for this clf!')
    else:
        has_coefs = True
    predict = clf.predict(X_test_scaled)
    accur += precision_score(y_test, predict)
    recall += recall_score(y_test, predict)
    f1 += f1_score(y_test, predict)
    matrix += confusion_matrix(y_test, predict).ravel()
    # Progress output: the cumulative (not yet averaged) F1 sum.
    print(f1)

elapsed = datetime.datetime.now() - start_time

# Mean coefficients, normalised so the largest magnitude is 1. Skipped when no
# coefficients were collected or when they are all zero (avoids 0/0).
mean_coefs = coefs / folds
max_abs_coef = np.max(np.abs(mean_coefs))
if has_coefs and max_abs_coef > 0:
    print(pd.DataFrame(mean_coefs / max_abs_coef))

print('accur', accur / folds)
print('recall', recall / folds)
print('f1', f1 / folds)
# Order of the flattened binary confusion matrix summed over all repetitions.
print('TN', 'FP', 'FN', 'TP')
print(matrix)
print('Total events:', matrix.sum())
print('By set:', total_rows)
# Sanity check: the test-set sizes of all repetitions should add up to the
# dataset size (presumably only when the row count divides evenly by `folds`).
if matrix.sum() == total_rows:
    print('Match! OK!')
print('time', elapsed)

105296
traindata_sea.csv

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save