# -*- coding: utf-8 -*-
import multiprocessing
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestRegressor
import sklearn_relief as rrelief
import traceback
import sys
import logging
# Suppress the UserWarning raised by the insert_row() line: df1.loc[row_number] = row_value
import warnings
import Utils

warnings.filterwarnings("ignore", category=UserWarning)
# Suppress pandas chained-assignment warnings
pd.options.mode.chained_assignment = None  # default='warn'
logger = logging.getLogger(__name__)
def sigmoid(s, midpoint, t, ep_length):
    # Sigmoid risk over an episode: rises from ~0 at the episode start towards
    # 1 as t approaches ep_length, centred on t = ep_length - midpoint
    return 1 / (1 + math.exp(s * (ep_length - midpoint - t)))
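# A quick sanity check of the curve (hypothetical parameters): with s=0.7,
# midpoint=4 and ep_length=8, the risk climbs from ~0.06 at t=0 to ~0.94 at t=8:
# >>> [round(sigmoid(0.7, 4, t, 8), 2) for t in range(9)]
# [0.06, 0.11, 0.2, 0.33, 0.5, 0.67, 0.8, 0.89, 0.94]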
def preprocess_data_binarize(
    dataset, chunks_type, unique_event_ids=None, check_unknown_events=False
):
    if unique_event_ids is None or len(unique_event_ids) == 0:
        unique_event_ids = dataset["Event_id"].unique()
        unique_event_ids = unique_event_ids[unique_event_ids != -1]
# group by hour and merge all the event ids to a list
dataset = dataset.groupby(
pd.to_datetime(dataset["Timestamps"]).dt.floor(chunks_type)
)["Event_id"].apply(set)
# create a 2d array (the final dataset) with zeros and name the columns to
# event types
df = pd.DataFrame(
np.zeros(shape=(len(dataset), len(unique_event_ids))), columns=unique_event_ids
)
df["Timestamps"] = dataset.index.values
    if check_unknown_events:
        for i in range(len(dataset)):
            # positional access: the grouped Series is indexed by timestamp
            if len(dataset.iloc[i]) == 0:
                continue
            for event in dataset.iloc[i]:
                if event in df.columns:
                    df.loc[i, event] = 1
    else:
        for i in range(len(dataset)):
            # keep only known ids so stray ids (e.g. -1) don't add new columns;
            # .loc also needs a list, not a set, as a column indexer
            events = [e for e in dataset.iloc[i] if e in df.columns]
            df.loc[i, events] = 1
return df
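# Usage sketch (hypothetical data): three raw events binarized into hourly
# chunks; the result has one 0/1 column per event id plus "Timestamps":
# >>> raw = pd.DataFrame({
# ...     "Timestamps": ["2021-01-01 00:10", "2021-01-01 00:40", "2021-01-01 01:05"],
# ...     "Event_id": [1, 2, 1],
# ... })
# >>> preprocess_data_binarize(raw, "H")
# gives two rows (00:00 and 01:00), with columns 1 and 2 set to 1 where each
# event occurred in that hour.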
def preprocess_data_remove_rare_events(df, target_event, remove_threshold):
"""
Remove the event types with frequency less than
remove_threshold*freq(target_event) or remove_threshold*mean_freq.
Parameters
----------
df : dataframe
dataset with the events
target_event : int
event id of the target event
remove_threshold : float
percentage of the frequency of the target event below of which the
events are removed
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
    if not df_valid(df):
        logger.error("RRE: df not valid")
        raise ValueError("RRE: df not valid")
events_freq = df[
df.columns[~df.columns.isin(["Timestamps", "Timedeltas", target_event])]
].sum()
    te_freq = df[target_event].sum()
mean_freq = events_freq.mean()
rare_freq = te_freq * remove_threshold
    if te_freq > mean_freq:
        logger.warning(
            f"RRE: Target event frequency "
            f"{te_freq} "
            f"is higher than the mean frequency of the rest of the events "
            f"{mean_freq}"
        )
    if rare_freq > mean_freq:
        logger.warning(
            f"RRE: Rare event frequency threshold "
            f"{rare_freq} "
            f"based on target event frequency "
            f"{te_freq}*{remove_threshold} "
            f"is higher than the mean frequency "
            f"{mean_freq}"
            f", changing rare threshold to mean based "
            f"{mean_freq}*{remove_threshold}={mean_freq*remove_threshold}"
        )
        rare_freq = mean_freq * remove_threshold
remove_candidate_events = events_freq[events_freq < rare_freq].index
logger.info(
f"RRE: Removing "
f"{len(remove_candidate_events)} "
f"events out of the "
f"{len(events_freq)} "
f"total events "
)
df = df.drop(columns=remove_candidate_events)
df = remove_empty_episodes(df, target_event)
    if not df_valid(df):
        logger.error("RRE: parameters too strict")
        raise ValueError("RRE: parameters too strict")
return df
def remove_empty_episodes(df, target_event):
"""
Remove the episodes with no rows.
Parameters
----------
df : np.DataFrame
The dataset
target_event : int
The target event id
Returns
-------
df : np.DataFrame
The cleaned dataset.
"""
    remove_candidates = []
    te_indices = df.index[df[target_event] == 1].tolist()
    for i in range(len(te_indices) - 1):
        if te_indices[i] + 1 == te_indices[i + 1]:
            remove_candidates.append(te_indices[i + 1])
    if len(remove_candidates) > 0:
        logger.info(
            f"CLEAN: Removed {len(remove_candidates)} empty episodes,"
            f" remaining {len(te_indices)-len(remove_candidates)} episodes"
        )
        df.drop(remove_candidates, inplace=True)
        df.reset_index(drop=True, inplace=True)
    return df
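# Illustration (hypothetical indices): if the target event fires at rows 3, 4
# and 9, rows 3 and 4 are back-to-back, so row 4 closes an episode with no
# events in it; it is dropped and the frame re-indexed, leaving two episodes.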
def preprocess_data_remove_frequent_events(df, target_event, remove_threshold):
"""
Remove the event types with frequency more than remove_threshold*max_freq.
Parameters
----------
df : dataframe
dataset with the events
target_event : int
event id of the target event
remove_threshold : float
percentage of the frequency of the target event below of which the
events are removed
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
    if not df_valid(df):
        logger.error("RFE: df not valid")
        raise ValueError("RFE: df not valid")
events_freq = df[
df.columns[~df.columns.isin(["Timestamps", "Timedeltas", target_event])]
].sum()
max_freq_event = events_freq.idxmax()
if str(max_freq_event) == str(target_event):
        logger.warning(
            f"RFE: Target event is the most frequent event "
            f"({events_freq.loc[target_event]} occurrences); "
            f"removing frequent events is meaningless, so RFE won't run"
        )
return df
else:
max_freq = events_freq.loc[max_freq_event]
remove_candidate_events = events_freq[
events_freq >= max_freq - (max_freq * remove_threshold)
].index
if len(remove_candidate_events) >= len(df.columns) - 4:
            logger.info(
                f"RFE: Skipping, as applying RFE would remove all the events: "
                f"{len(remove_candidate_events)} "
                f"events out of the "
                f"{len(df.columns) - 4} "
                f"total events"
            )
return df
logger.info(
f"RFE: Maximum frequency is "
f"{max_freq}. "
f"Removing "
f"{len(remove_candidate_events)}"
f" events with frequency >= "
f"{max_freq - (max_freq * remove_threshold)}"
)
if target_event in remove_candidate_events:
remove_candidate_events = remove_candidate_events.drop(target_event)
df = df.drop(columns=remove_candidate_events)
df = remove_empty_episodes(df, target_event)
        if not df_valid(df):
            logger.error("RFE: parameters too strict")
            raise ValueError("RFE: parameters too strict")
return df
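# Worked example (hypothetical frequencies): with max_freq=100 and
# remove_threshold=0.1, every event occurring >= 100 - 100*0.1 = 90 times is
# treated as near-constant background noise and dropped.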
def preprocess_data_keep_only_first_event(df, target_event):
"""
Keeps only the first occurence of events per episode
(i.e. episodes = between target_event occurences).
Parameters
----------
df : dataframe
dataset with the events
target_event : int
event id of the target event
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
    if not df_valid(df):
        logger.error("KOFE: df not valid")
        raise ValueError("KOFE: df not valid")
start_row_index = 0
removed_events_cnt = 0
for index, row in df.iterrows():
if row[target_event] == 1:
            # slice the df to the episode of the target event
            # (excluding the row of the target event);
            # change index to index+1 if you want to include the target event row
            episode = df.iloc[start_row_index:index]
episode = episode[
episode.columns[
~episode.columns.isin(["Timestamps", "Timedeltas", target_event])
]
]
for column_index in episode:
column = episode[column_index]
for i in range(
len(column) - 1, 0, -1
): # iterating bottom up to search for trailing 1s
if column.iloc[i] == 1 and column.iloc[i] == column.iloc[i - 1]:
actual_row_index = start_row_index + i
df.loc[actual_row_index, column_index] = 0
removed_events_cnt += 1
if index + 1 < len(df): # start from the row after the target event
start_row_index = index + 1
if removed_events_cnt > 0:
logger.info(f"KOFE: Removed {removed_events_cnt} events")
df = remove_empty_episodes(df, target_event)
    if not df_valid(df):
        logger.error("KOFE: parameters too strict")
        raise ValueError("KOFE: parameters too strict")
return df
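# Illustration (hypothetical column): within one episode a column with values
# [1, 1, 0, 1, 1] becomes [1, 0, 0, 1, 0]; only the first 1 of each
# consecutive run survives.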
def insert_row(row_number, df, row_value):
"""
Function to insert row in the dataframe
Parameters
----------
row_number : int
the index of the row to insert
df : dataframe
the dataframe
    row_value : pd.Series or scalar
        the row values to insert
Returns
-------
df : np.DataFrame
The dataset with the inserted row.
"""
# Slice the upper half of the dataframe
df1 = df[0:row_number]
# Store the result of lower half of the dataframe
df2 = df[row_number:]
# Insert the row in the upper half dataframe
df1.loc[row_number] = row_value
# Concat the two dataframes
df_result = pd.concat([df1, df2])
# Reassign the index labels
df_result.index = [*range(df_result.shape[0])]
return df_result
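# Usage sketch (hypothetical frame): inserting a row at position 1 shifts the
# remaining rows down and re-labels the index 0..n:
# >>> small = pd.DataFrame({"a": [10, 30]})
# >>> insert_row(1, small, 20)["a"].tolist()
# [10, 20, 30]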
def binary_sum(x):
    # Collapse a window column to 1.0 if any event occurred in it, else 0.0
    return 1.0 if x.sum() > 0 else 0.0
def df_valid(df):
    # Valid means at least one row and more than 4 columns, i.e. at least one
    # feature event beyond the target event and the metadata columns
    return len(df) > 0 and len(df.columns) > 4
def preprocess_data_MIL_oversampling(df, target_event, window_size, threshold):
"""
Apply Multi-Instance Learning oversampling the events closer to the target
event. The merged row is added right below the window from which is computed
Parameters
----------
df : dataframe
dataset with the events
target_event : int
event id of the target event
window_size : int
the window size of the sampling
threshold : float
specifies when the oversampling begins
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
    if not df_valid(df):
        logger.error("MIL_OVER: df not valid")
        raise ValueError("MIL_OVER: df not valid")
new_rows = []
start_row_index = 0
for index, row in df.iterrows():
if row[target_event] == 1:
            # slice the df to the episode of the target event
            # (excluding the row of the target event);
            # change index to index+1 if you want to include the target event row
            episode = df.iloc[start_row_index:index]
ep_index = 0
for row_index, ep_row in episode.iterrows():
if (
row_index + window_size >= index + 1
): # stop right before the target event
break
if ep_row["Risk"] >= threshold:
                    window = episode.iloc[ep_index : ep_index + window_size]
new_row = window[
window.columns[
~window.columns.isin(["Timestamps", "Timedeltas", "Risk"])
]
].apply(binary_sum, axis=0)
new_row["Risk"] = window.tail(1)["Risk"].item()
new_row["Timestamps"] = np.datetime_as_string(
window.tail(1)["Timestamps"], unit="s"
)[
0
] # convert nanosec to string Datetime
new_row["Timedeltas"] = window.tail(1)["Timedeltas"].item()
new_rows.append(
(row_index + window_size - 1 + len(new_rows), new_row)
) # row_index is the index of the row of the df
ep_index += 1
if index + 1 < len(df): # start from the row after the target event
start_row_index = index + 1
    logger.info(
        f"MIL_OVER: Adding {len(new_rows)} new/merged occurrences of events"
    )
for row_index, values in new_rows:
df = insert_row(row_index, df, values)
return df
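# Illustration (hypothetical episode): with window_size=2 and threshold=0.8,
# every episode row whose Risk >= 0.8 opens a 2-row window; the window's event
# columns are OR-ed together via binary_sum and the merged row is inserted
# right after the window, inheriting the window's last Risk/Timestamps/
# Timedeltas values.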
def preprocess_data_feature_selection(df, target_event):
"""
Apply Feature Selection and more specifically the RRelief algorithm.
It keeps only the events with non-zero weight.
Parameters
----------
df : dataframe
dataset with the events
target_event : int
event id of the target event
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
    if not df_valid(df):
        logger.error("FS: df not valid")
        raise ValueError("FS: df not valid")
    r = rrelief.Relief()
    input_matrix = df.iloc[:, :-3].to_numpy()
    feature_labels = df.iloc[:, :-3].columns.values
    r.fit_transform(input_matrix, df["Risk"].array)  # fit to learn the feature weights
    weights = dict(zip(feature_labels, r.w_))
fs = {k: v for k, v in weights.items() if v > 0}
# fs = dict(sorted(weights.items(), key=lambda x: x[1], reverse=True)[:top])
keep_column_labels = [*fs.keys()]
if target_event not in fs.keys():
keep_column_labels.append(target_event)
keep_column_labels.append("Risk")
keep_column_labels.append("Timestamps")
keep_column_labels.append("Timedeltas")
df = df[keep_column_labels]
if len(weights) - len(fs) > 0:
logger.info(
f"FS: Removed {len(weights)-len(fs)} out of {len(weights)} features"
)
    if not df_valid(df):
        logger.error("FS: parameters too strict")
        raise ValueError("FS: parameters too strict")
return df
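# Weight-filtering sketch (hypothetical weights): only strictly positive
# Relief weights survive, e.g.
# >>> weights = {101: 0.4, 102: 0.0, 103: -0.2}
# >>> {k: v for k, v in weights.items() if v > 0}
# {101: 0.4}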
def compute_risk(df, target_event, s, midpoint, buffer_time=None, risk_ignore_buffer=True):
"""
Compute the risk based on the Sigmoid function.
Parameters
----------
df : pd.DataFrame
Dataset with the events
target_event : int
event id of the target event
ep_length : str
The episode length expressed with a string e.g. '8H'
s : float
steepness
midpoint : str
skewness expressed with a string e.g. '8H'
buffer_time : str
The buffered time before the target event to which the risk should be
zero, expressed with a string e.g. '1H'
Returns
-------
df : np.DataFrame
The preprocessed dataset.
"""
if "Timedeltas" not in df.columns:
df["Timedeltas"] = compute_timedeltas_mins(df, target_event)
midpoint_mins = Utils.convert_hours_to_mins(Utils.strtime_to_hours(midpoint))
buffer_time_mins = 0
if buffer_time is not None:
buffer_time_mins = Utils.convert_hours_to_mins(Utils.strtime_to_hours(buffer_time))
start_row_index = 0
for index, row in df.iterrows():
if row[target_event] == 1:
            # slice the df to the episode of the target event (including the row of the target event);
            # change index+1 to index if you want to exclude the target event row
            episode = df.iloc[start_row_index : index + 1]
for seg_index, ep_row in episode.iterrows():
segment_time_mins = episode.loc[seg_index, "Timedeltas"]
                if (buffer_time is not None and segment_time_mins <= buffer_time_mins) or index == seg_index:
df.loc[seg_index, "Risk"] = 0
else:
if buffer_time is not None and risk_ignore_buffer:
segment_time_mins = segment_time_mins - buffer_time_mins
df.loc[seg_index, "Risk"] = Utils.sigmoid_risk(
s, midpoint_mins, segment_time_mins
)
start_row_index = index + 1
return df
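# Buffer illustration (hypothetical times): with buffer_time='1H' and
# risk_ignore_buffer=True, a segment 30 minutes before the target event gets
# Risk 0, while one 90 minutes before is scored as if it were only 30 minutes
# away (90 - 60, i.e. the sigmoid clock starts after the buffer).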
def compute_timedeltas_mins(df, target_event):
    """
    For every row, compute the time distance in minutes from the row's
    timestamp to the target event that closes its episode.
    """
    start_row_index = 0
    dataset_timedeltas = []
    for index, row in df.iterrows():
        if row[target_event] == 1:
            # slice the df to the episode of the target event
            # (including the row of the target event);
            # change index+1 to index if you want to exclude the target event row
            episode = df.iloc[start_row_index : index + 1]
te_date = episode["Timestamps"].values[-1]
for timestamp in episode["Timestamps"].values:
timedelta = te_date - timestamp
time_distance_mins = timedelta.astype('timedelta64[m]') / np.timedelta64(1, 'm')
dataset_timedeltas.append(time_distance_mins)
if index + 1 < len(df): # start from the row after the target event
start_row_index = index + 1
return dataset_timedeltas
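# Illustration (hypothetical episode): four hourly rows ending at the target
# event yield Timedeltas of [180.0, 120.0, 60.0, 0.0] minutes.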
def train_rf_model(dataset, chunks_type, target_event, s, midpoint, rre=False, rfe=False, kofe=False, mil_over=False,
                   mil_down=False, fs=False, rre_thres=0.2, rfe_thres=0.1, mil_thres=0.8, mil_window_size=2, fs_top=20,
                   rf_n_jobs=multiprocessing.cpu_count(), rf_max_depth=12, rf_random_state=0, rf_n_estimators=1000):
try:
df = preprocess_data_binarize(dataset, chunks_type)
df["Timedeltas"] = compute_timedeltas_mins(df, target_event)
        if rre:
            df = preprocess_data_remove_rare_events(df, target_event, rre_thres)
        if rfe:
            df = preprocess_data_remove_frequent_events(df, target_event, rfe_thres)
        if kofe:
            df = preprocess_data_keep_only_first_event(df, target_event)
        df = compute_risk(df, target_event, s, midpoint)
        if mil_over:
            df = preprocess_data_MIL_oversampling(df, target_event, mil_window_size, mil_thres)
        if fs:
            df = preprocess_data_feature_selection(df, target_event)
    except Exception as err:
        logger.error(err)
        traceback.print_exc(file=sys.stdout)
        raise  # df may be undefined or half-processed here; don't train on it
regr = RandomForestRegressor(n_jobs=rf_n_jobs, max_depth=rf_max_depth, random_state=rf_random_state,
n_estimators=rf_n_estimators)
train_df = df.iloc[:, :-3].reindex(sorted(df.iloc[:, :-3].columns), axis=1)
regr.fit(train_df, df['Risk'])
feature_importance = pd.DataFrame(regr.feature_importances_,
index=train_df.columns,
columns=['importance']).sort_values('importance', ascending=False)
return (regr, feature_importance)
def predict(regr, dataset, chunks_type, unique_event_ids=None):
    df = preprocess_data_binarize(dataset, chunks_type, unique_event_ids, check_unknown_events=True)
    # sort the feature columns so their order matches the one used in train_rf_model
    features = df.iloc[:, :-1].reindex(sorted(df.iloc[:, :-1].columns), axis=1)
    return regr.predict(features)
# Example usage:
# s = 0.7
# midpoint = '2H'
# target_event = 35544
# dataset = pd.read_csv('test-df.csv', parse_dates=True)
# regr, feature_importance = train_rf_model(dataset, 'H', target_event, s, midpoint)
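# Scoring new data with the fitted model (a sketch; 'new-events.csv' is
# hypothetical and must have the same "Timestamps"/"Event_id" layout):
# train_ids = dataset["Event_id"].unique()
# train_ids = train_ids[train_ids != -1]
# new_events = pd.read_csv('new-events.csv', parse_dates=True)
# risks = predict(regr, new_events, 'H', unique_event_ids=train_ids)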