Supervised_Predictor_v5.r 30.1 KB
edit raw blame history

suppressMessages(library(CORElearn))
suppressMessages(library(dplyr))
suppressMessages(library(plyr))
suppressMessages(library(data.table))
suppressMessages(library(randomForest))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(argparser))
suppressMessages(library(stringr))
suppressMessages(library(lubridate))

export_ds_for_spm <- function(target_event,episodes_list,output){
  if (file.exists(output)) {
    file.remove(output)
  }
  #output for HirateYamana
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]][ , !(names(episodes_list[[ep_index]]) %in% c("Timestamps"))]
    ep_list = list()
    for(i in (1:nrow(ep))){
      matches = which(ep[i,] %in% c(1))
      if(length(matches) == 0){
        next
      }
      line=paste(matches,collapse=" ")
      ep_list[i] = line
    }
    if(length(ep_list) == 0){
      next
    }
    ep_list[length(ep_list)+1] = target_event
    episode = ""
    for(ep_lli in (1:length(ep_list))){
      if(length(ep_list[[ep_lli]]) > 0){
        index = paste(paste("<",ep_lli,sep=""),">",sep="")
        if(episode == ""){
          episode = paste(index,ep_list[[ep_lli]],sep=" ")
        } else {
          episode = paste(episode,paste(index,ep_list[[ep_lli]],sep=" "),sep=" -1 ")
        }
      }
    }
    write(paste(episode,"-1 -2"),file=output,append=TRUE)
  }
}

remove_rare_events <- function(ds,target_event_frequency_proportion_rare){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING REMOVE RARE EVENTS~~~~~~~")
  }
  a = table(ds$Event_id)
  target_event_frequency = a[names(a)==target_event][[1]]
  rare_events = as.integer(names(a[a < target_event_frequency*target_event_frequency_proportion_rare]))
  return(ds[!(ds$Event_id %in% rare_events),])
}

remove_frequent_events <- function(ds,max_event_frequency_proportion_frequent){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING REMOVE FREQUENT EVENTS~~~~~~~")
  }
  a = table(ds[!(ds$Event_id == target_event),]$Event_id)
  max_freq = sort(a,decreasing = TRUE)[[1]]
  frequent_events = as.integer(names(a[a > max_freq*max_event_frequency_proportion_frequent]))
  #print(frequent_events)
  return(ds[!(ds$Event_id %in% frequent_events),])
}

keep_only_first_occureness <- function(episodes_list){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING KEEP ONLY FIRST OCCURENESS~~~~~~~")
  }
  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    #For every segment of each episode starting from the end up to the second segment.
    #We need to keep only the 1st occurness of consequtive events, hence starting from the end is the easy way.
    if(nrow(ep) < 2){
      next
    }
    for(i in (nrow(ep):2)){
      #as we deal with binary vectors, to find the indeces that both vectors have "1" we sum them and check for "2"s in the result
      matches = which((ep[i,]+ep[i-1,]) %in% c(2))
      #replace the 1s with 0s in the matching positions of the segment that is closer to the end of the episode
      ep[i,][c(matches)] = 0
    }
    episodes_list[[ep_index]] = ep
  }
  return(episodes_list)
}

mil_text <- function(milw,F_thres,episodes_list,b_length){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
  }
  window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))

  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    if(nrow(ep) < 1){
      next
    }
    new_ep = data.frame(matrix(ncol = b_length+1, nrow = 0))
    i = 1
    while(i <= nrow(ep)){
      new_ep = rbind(new_ep,ep[i,])

      if(ep[i,][b_length+1] >= F_thres && nrow(window_df) < milw){
        window_df = rbind(window_df,ep[i,])
      }
      if((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0){
        mean = colMeans(window_df)
        mean[mean > 0] = 1
        mf <- data.frame(as.list(mean))
        mf[1] = ep[i,][1]
        mf[b_length+1] = ep[i,][b_length+1]
        colnames(mf) = colnames(new_ep)
        new_ep = rbind(new_ep,mf)
        if(nrow(window_df) > 1){
          i = i - (nrow(window_df)-2)
        }
        window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
      }
      i = i + 1
    }
    episodes_list[[ep_index]] = new_ep
  }
  return(episodes_list)
}

mil_image <- function(milw,F_thres,episodes_list,b_length){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
  }

  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    if(nrow(ep) < 1){
      next
    }
    new_ep = data.frame(matrix(ncol = b_length+1, nrow = 0))
    #a data.frame with the vectors that need to be averaged
    window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
    i = 1
    while(i <= nrow(ep)){
      #new_ep = rbind(new_ep,ep[i,])
      if(nrow(window_df) < milw){
        window_df = rbind(window_df,ep[i,])
      }
      if((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0){
        mean = colMeans(window_df)
        mean[mean > 0] = 1
        mf = data.frame(as.list(mean))
        mf[1] = ep[i,][1]
        mf[b_length+1] = ep[i,][b_length+1]
        #colnames(mf) = colnames(new_ep)
        new_ep = rbind(new_ep,mf)
        if(window_df[1,][b_length+1] >= F_thres && nrow(window_df) > 1){
          i = i - (nrow(window_df)-1)
        }
        window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
      }
      i = i + 1
    }
    episodes_list[[ep_index]] = new_ep
  }
  return(episodes_list)
}

#the Risk function
compute_F <- function(s,midpoint,t,ep_length){
  #s affects the steepness
  # s <- 0.9
  return(1/(1+exp(s*(ep_length-midpoint-t))))
}

#convert event vectors to binary vectors
compute_frequency_vectors <- function(aggr_episode_df,b_length,s,midpoint){
  freq_aggr_episode_df <- data.frame(matrix(ncol = b_length+2, nrow = 0))
  x <- c(c("Timestamps"), c(paste("e_",c(1:b_length),sep = "")), c("Risk_F"))
  # colnames(bin_aggr_episode_df) <- x
  for(i in 1:nrow(aggr_episode_df)) {
    #init a vector with target_event 0s
    freq_vector = as.vector(integer(b_length))
    seg <- aggr_episode_df[i,]
    #if segment contains the j number, replace the 0 in the bin_vector with 1
    for(value in seg$x[[1]]){
      if(is.na(value)){
        next
      }
      freq_vector[[value]] = length(which(seg$x[[1]] == value))
    }
    #add a new line to the bin_aggr_epissode_df
    #we use a matrix holding the elements of the new_data.frame as matrix is able to store variable of different data types
    F = compute_F(s,midpoint,i-1,nrow(aggr_episode_df))
    if(midpoint >= nrow(aggr_episode_df) || sum(freq_vector) == 0 ){
      F = 0
    }
    date = seg$Timeframe[[1]]
    new_df = data.frame(matrix(c(date, freq_vector,F),nrow=1,ncol=b_length+2))
    freq_aggr_episode_df <- rbind(freq_aggr_episode_df,new_df)
  }
  # x <- c(c("Timestamps"), c(paste("e_",c(1:3405))), c("Risk_F"))
  colnames(freq_aggr_episode_df) <- x
  return(freq_aggr_episode_df)
}

create_episodes_list <- function(ds,target_event,b_length,s,midpoint,test_mode){
  if(!csv){
    print("~~~~~~~CREATING FREQUENCY VECTORS AND BINARIZE THEM~~~~~~~")
  }
  #devide in episodes
  target_event_spotted = FALSE
  #a list with data.frames for the episodes (each episode one data.frame)
  episodes_list = list()
  #data.frame for episodes
  episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
  #iterate over every line of the original dataset
  for(i in 1:nrow(ds)) {
    #get the current row of the ds
    meas <- ds[i,]
    #If it is the target event enable the appropriate flag
    if((meas$Event_id == target_event)){
      target_event_spotted = TRUE
    }
    #fill the episode data.frame with the events that are between two target events
    if(meas$Event_id != target_event && target_event_spotted){
      episode_df <- rbind(episode_df,data.frame(Timestamps=meas$Timestamps, Event_id=meas$Event_id))
    } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) != 0){
      episode_df <- rbind(episode_df,data.frame(Timestamps=meas$Timestamps, Event_id=NA))

      #a second occurness of the target event is spotted, close the episode
      #target_event_spotted = FALSE
      #aggregate by day all the events to form the segments inside the episodes
      labels_seq = seq(floor_date(as.POSIXct(head(episode_df,1)$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),'hour'), floor_date(as.POSIXct(tail(episode_df,1)$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),'hour'), by="1 hour")
      aggr_episode_df = aggregate(episode_df[ ,2], FUN=function(x){return(x)}, by=list(Timeframe=cut(as.POSIXct(episode_df$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),"hour"))) #%Y-%m-%dT%H:%M:%OSZ

      aggr_episode_df[, 1]  <- as.POSIXct(aggr_episode_df[, 1])
      full = data.frame(Timeframe=labels_seq)
      full[, 1]  <- as.POSIXct(full[, 1])
      aggr_episode_df = merge(full,aggr_episode_df,by='Timeframe',all.x=TRUE)

      #aggr_episode_df = head(aggr_episode_df, -1)


      #binarize the frequncy vector
      bin_aggr_episode_df = compute_frequency_vectors(aggr_episode_df,b_length,s,midpoint)

      #Remove event 0, which does not provide any info KOUGKA
      bin_aggr_episode_df = bin_aggr_episode_df[ , !(names(bin_aggr_episode_df) %in% c("e_1"))]

      #add the episode to the episodes_list
      episodes_list[[length(episodes_list)+1]] = bin_aggr_episode_df
      #reset episode_df to en empty data.frame
      episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
    } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) == 0 && length(episodes_list) > 0 && test_mode){
      freq_vector = as.vector(integer(b_length))
      new_df = data.frame(matrix(c(0, freq_vector,0),nrow=1,ncol=b_length+2))
      episode_df <- rbind(episode_df,new_df)

      x <- c(c("Timestamps"), c(paste("e_",c(1:b_length),sep = "")), c("Risk_F"))
      colnames(episode_df) <- x
      episode_df = episode_df[ , !(names(episode_df) %in% c("e_1"))]

      #add the episode to the episodes_list
      episodes_list[[length(episodes_list)+1]] = episode_df
      #reset episode_df to en empty data.frame
      episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
    }
  }
  return(episodes_list)
}

preprocess <- function(ds,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,w,F_thres,test_mode){

  #Remove events that appear < n times. We consider n = (target event frequency)/2
  if(REMOVE_RARE_EVENTS){
    ds<-remove_rare_events(ds,target_event_frequency_proportion_rare)
  }

  #Remove events that appear < n times. We consider n = (target event frequency)/2
  if(REMOVE_FREQUENT_EVENTS){
    ds<-remove_frequent_events(ds,max_event_frequency_proportion_frequent)
  }

  episodes_list = create_episodes_list(ds,target_event,b_length,s,midpoint,test_mode)
  #if(length(episodes_list) == 0){
  # return()
  #}

  #binarize the vector
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    ep[2:(ncol(ep)-1)][ep[2:(ncol(ep)-1)] > 0] = 1
    episodes_list[[ep_index]] = ep
  }

  # keep only the first occurness of event in consecutive segments
  if(KEEP_ONLY_FIRST_OCCURENESS){
    episodes_list <- keep_only_first_occureness(episodes_list)
  }

  # Multi-instance learning to increase the pattern frequency
  if(MULTI_INSTANCE_LEARNING_TEXT){
    episodes_list <- mil_text(w,F_thres,episodes_list,b_length)
  } else if(MULTI_INSTANCE_LEARNING_IMAGE){
    episodes_list <- mil_image(w,F_thres,episodes_list,b_length)
  }
  return(episodes_list)
}

feature_selection <- function(merged_episodes,top_features){
  if(!csv){
    print("~~~~~~~APPLYING FEATURE SELECTION~~~~~~~")
  }
  estReliefF <- attrEval(Risk_F ~ ., merged_episodes, estimator="RReliefFexpRank", ReliefIterations=50)
  sorted_indeces = order(estReliefF, decreasing = TRUE)
  merged_episodes = merged_episodes %>% select(sorted_indeces[1:min(top_features,length(sorted_indeces))],ncol(merged_episodes))
  return(merged_episodes)
}

read_dataset <- function(path){
  dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  dataset[, 2]  <- as.numeric(dataset[, 2])
  return(dataset)
}

read_dataset_cross <- function(path){
  dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  dataset[, 2]  <- as.numeric(dataset[, 2])

  aggr_episode_df = aggregate(dataset, FUN=function(x){return(x)}, by=list(cut(as.POSIXct(dataset$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),"day"))) #%Y-%m-%dT%H:%M:%OSZ

  training = sample_n(aggr_episode_df,(2*nrow(aggr_episode_df)/3))

  testing = setDT(aggr_episode_df)[!training, on="Group.1"]

  testing = testing[, !"Group.1"]
  testing_df = data.frame(Timestamps=unlist(testing[,1][1][[1]]),Event_id=unlist(testing[,2][1][[1]]))
  for(i in 2:nrow(testing)){
    testing_df = rbind(testing_df,data.frame(Timestamps=unlist(testing[,1][i][[1]]),Event_id=unlist(testing[,2][i][[1]])))
  }

  training = as.data.table(training)[, !"Group.1"]
  training_df = data.frame(Timestamps=unlist(training[,1][1][[1]]),Event_id=unlist(training[,2][1][[1]]))
  for(i in 2:nrow(training)){
    training_df = rbind(training_df,data.frame(Timestamps=unlist(training[,1][i][[1]]),Event_id=unlist(training[,2][i][[1]])))
  }

  return(list("training" = training_df, "testing"=testing_df))
}

read_dataset_cross2 <- function(path,form){
  dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  dataset[, 2]  <- as.numeric(dataset[, 2])
  if(form == 1){
    training_df = dataset[1:(2*nrow(dataset)/3),]
    testing_df = dataset[(2*nrow(dataset)/3):nrow(dataset),]
  } else if (form == 2){
    training_df = dataset[(nrow(dataset)/3):nrow(dataset),]
    testing_df = dataset[1:(nrow(dataset)/3),]
  } else {
    training_df = rbind(dataset[1:(nrow(dataset)/3),],dataset[(2*nrow(dataset)/3):nrow(dataset),])
    testing_df = dataset[(nrow(dataset)/3+1):(2*nrow(dataset)/3),]
  }
  return(list("training" = training_df, "testing"=testing_df))
}

find_next_fake_episodes <- function(ep_index,test_episodes_list){
  fake_episodes_vec = vector()
  cnt = 0
  if(ep_index < length(test_episodes_list)){
    for(i in ((ep_index+1):length(test_episodes_list))){
      n_ep = test_episodes_list[i]
      if(nrow(n_ep[[1]]) == 1 && n_ep[[1]]$Timestamps == 0){
        cnt = cnt + 1
        fake_episodes_vec <- c(fake_episodes_vec, i)
      } else {
        return(fake_episodes_vec)
      }
    }
  }
  return(fake_episodes_vec)
}

find_next_close_episodes <- function(ep_index,test_episodes_list,pred_index){
  ep_length = nrow(test_episodes_list[ep_index][[1]])
  remaining_hours = max_warning_interval-(ep_length-pred_index)

  nc_episodes_vec = vector()
  cnt = 0
  while (remaining_hours > 0 && ep_index < length(test_episodes_list)) {
    ep_index = ep_index + 1
    n_ep = test_episodes_list[ep_index]
    if(nrow(n_ep[[1]]) == 1 && n_ep[[1]]$Timestamps == 0){
      next
    }
    n_ep_length = nrow(n_ep[[1]])
    if(n_ep_length <= remaining_hours){
      remaining_hours = remaining_hours-n_ep_length
      if(remaining_hours >= 0){
        cnt = cnt + 1
        nc_episodes_vec <- c(nc_episodes_vec, ep_index)
      }
    } else{
      return(nc_episodes_vec)
    }
  }
  return(nc_episodes_vec)
}

eval <- function(train_episodes,test_episodes_list,seed,r){
  set.seed(seed)
  my.rf = randomForest(Risk_F ~ .,data=train_episodes,importance=TRUE)
  #varImpPlot(my.rf)
  false_positives = 0
  tp_vec = vector()
  fp_vec = vector()
  fn_vec = vector()
  true_positives = 0
  false_negatives = 0
  ep_index = 1
  while (ep_index <= length(test_episodes_list)) {
    ep = test_episodes_list[[ep_index]]
    fake_episodes_vec = find_next_fake_episodes(ep_index,test_episodes_list)
    fake_ep_cnt = length(fake_episodes_vec)

    ep = ep[ , !(names(ep) %in% c("Timestamps"))]
    Prediction <- predict(my.rf, ep)
    ep_legth = length(Prediction)
    pred_indeces = as.numeric(names(Prediction[Prediction >= acceptance_threshold]))
    predicted_next_episodes = 0
    if(length(pred_indeces) > 0){
      pred_index = tail(pred_indeces, n=1)
      nc_episodes_vec = find_next_close_episodes(ep_index,test_episodes_list,pred_index)
      predicted_next_episodes = length(nc_episodes_vec)
    }


    if(length(pred_indeces[pred_indeces < (ep_legth-(max_warning_interval))]) > 0){
      fp_reps = length(pred_indeces[pred_indeces < (ep_legth-(max_warning_interval))])
      false_positives = false_positives + fp_reps + (fake_ep_cnt*fp_reps)
      fp_vec = c(fp_vec,rep(ep_index,fp_reps),rep(fake_episodes_vec,fp_reps))
    }
    if(length(pred_indeces[pred_indeces >= (ep_legth-(max_warning_interval)) & pred_indeces <= (ep_legth-min_warning_interval)]) > 0){
      true_positives = true_positives + 1 + fake_ep_cnt + predicted_next_episodes
      tp_vec = c(tp_vec,ep_index,fake_episodes_vec,nc_episodes_vec)
    } else {
      if(predicted_next_episodes > 0){
        true_positives = true_positives + predicted_next_episodes
        tp_vec = c(tp_vec,nc_episodes_vec)
      }
      false_negatives = false_negatives + 1 + fake_ep_cnt
      fn_vec = c(fn_vec,ep_index,fake_episodes_vec)
    }
    ep_index = ep_index + 1 + fake_ep_cnt + predicted_next_episodes
  }

  precision = true_positives/(true_positives+false_positives)
  if((true_positives+false_positives) == 0){
    precision = 0
  }
  recall = true_positives/length(test_episodes_list)

  F1 = 2*((precision*recall)/(precision+recall))
  if((precision+recall) == 0){
    F1 = 0
  }
  if(!csv){
    cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\ntp:",paste(as.character(tp_vec), sep="", collapse=","), "\nfp:",paste(as.character(fp_vec), sep="", collapse=","), "\nfn:",paste(as.character(fn_vec), sep="", collapse=",")))
  } else {
    cat(paste(argv$test,",",true_positives,",",false_positives,",",false_negatives,",",precision,",",recall,",",F1,",",argv$fet,",",argv$tet,",",argv$rre,",",argv$rfe,",",argv$kofe,",",argv$mili,",",argv$milt,",",argv$fs,",",argv$top,",",argv$rer,",",argv$fer,",",argv$seed,",",argv$steepness,",",argv$pthres,",",argv$milw,",",argv$milthres,",",argv$midpoint,",",argv$minwint,",",argv$maxwint,",",r,",tp,",paste(as.character(tp_vec), sep="", collapse=","),",fp,",paste(as.character(fp_vec), sep="", collapse=","),",fn,",paste(as.character(fn_vec), sep="", collapse=","),"\n",sep=""))
  }
  return(my.rf)
}

plot <- function(test_episodes_list, episode_index, my.rf){
  test_episodes = test_episodes_list[[episode_index]][ , !(names(test_episodes_list[[episode_index]]) %in% c("Timestamps"))]
  Prediction <- predict(my.rf, test_episodes)
  results = data.frame(Risk_F=test_episodes$Risk_F,num_Prediction=as.numeric(Prediction))
  mse = mean((Prediction-test_episodes$Risk_F)^2)

  chart =ggplot(results,aes((1:nrow(results)))) +
    # geom_rect(aes(xmin = ceiling(nrow(df_test)/2), xmax = nrow(df_test), ymin = -Inf, ymax = Inf),
    #           fill = "yellow", alpha = 0.003) +
    geom_line(aes(y = Risk_F, colour = "Actual")) +
    geom_line(aes(y = num_Prediction, colour="Predicted")) +
    labs(colour="Lines") +
    xlab("Segments") +
    ylab('Risk (F)') +
    ggtitle("Risk Prediction") + # (RR_KF_2YEARS_PAT08)
    theme(plot.title = element_text(hjust = 0.5)) +
    geom_text(aes(label = paste("MSE=",round(mse,3)), x = 20, y = 1), hjust = -2, vjust = 6, color="black", size=4) #add MSE label

  # Disable clip-area so that the MSE is shown in the plot
  gt <- ggplot_gtable(ggplot_build(chart))
  gt$layout$clip[gt$layout$name == "panel"] <- "off"
  grid.draw(gt)
}


p <- arg_parser("Implementation of the AIRBUS Predictor")
# Add a positional argument
p <- add_argument(p, "id", help="experiment ID")
p <- add_argument(p, "train", help="training dataset")
p <- add_argument(p, "test", help="test dataset")
p <- add_argument(p, "fet", help="different types of the fault events",default=151)
p <- add_argument(p, "tet", help="type of the target fault events",default=151)
p <- add_argument(p, "--rre", help="remove rare events", default=FALSE)
p <- add_argument(p, "--rfe", help="remove frequent events", default=FALSE)
p <- add_argument(p, "--kofe", help="keep only first event", default=FALSE)
p <- add_argument(p, "--milt", help="MIL as written in the text of the paper", default=TRUE)
p <- add_argument(p, "--mili", help="MIL as shonw in the Figure of the paper", default=FALSE)
p <- add_argument(p, "--milthres", help="MIL threshold to the sigmoid function for over-sampling", default=0.3)
p <- add_argument(p, "--steepness", help="steepness of the sigmoid function", default=0.8)
p <- add_argument(p, "--midpoint", help="midpoint of the sigmoid function (in days)", default=4)
p <- add_argument(p, "--fs", help="apply feature selection", default=TRUE)
p <- add_argument(p, "--top", help="# of features to keep in feature selection", default=50)
p <- add_argument(p, "--rer", help="rare events ratio of the target event frequency", default=0.2)
p <- add_argument(p, "--fer", help="frequent events ratio of the frequency of the most frequent event", default=0.8)
p <- add_argument(p, "--milw", help="MIL window size (in days)", default=2)
p <- add_argument(p, "--pthres", help="prediction threshold to the Risk value for a true positive episode", default=0.35)
p <- add_argument(p, "--seed", help="seed for RF", default=500)
p <- add_argument(p, "--csv", help="output for csv", default=FALSE)


p <- add_argument(p, "--spme", help="export datasets for sequential pattern minning", default=FALSE)
p <- add_argument(p, "--java", help="the java path", default="/usr/bin/java")
p <- add_argument(p, "--python", help="the java path", default="/usr/bin/python")
p <- add_argument(p, "--cep", help="the java path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spm_rules.py")
p <- add_argument(p, "--spmf", help="the spmf path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spmf.jar")
p <- add_argument(p, "--conf", help="minimum support (minsup)", default="20%")
p <- add_argument(p, "--minti", help="minimum time interval allowed between two succesive itemsets of a sequential pattern", default=1)
p <- add_argument(p, "--maxti", help="maximum time interval allowed between two succesive itemsets of a sequential pattern", default=5)
p <- add_argument(p, "--minwi", help="minimum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=1)
p <- add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11)
p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1)
p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=8)

p <- add_argument(p, "--cross", help="cross validation", default=5)
p <- add_argument(p, "--form", help="form", default=1)


argv = data.frame()
if( length(commandArgs(trailingOnly = TRUE)) != 0){
  argv <- parse_args(p)
} else {
  argv <- parse_args(p,c(1,"/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch2_6396te_training.csv","/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch2_6396te_training.csv",6396,6396))
}


TEST_DATA = FALSE
id = argv$id
REMOVE_RARE_EVENTS = argv$rre
REMOVE_FREQUENT_EVENTS = argv$rfe
KEEP_ONLY_FIRST_OCCURENESS = argv$kofe
MULTI_INSTANCE_LEARNING_TEXT = argv$milt #MIL as explained in the text
MULTI_INSTANCE_LEARNING_IMAGE = argv$mili #MIL as presented in the figure
FEATURE_SELECTION = argv$fs
top_features = argv$top
target_event_frequency_proportion_rare = argv$rer
max_event_frequency_proportion_frequent = argv$fer
milw = argv$milw
F_thres = argv$milthres
s = argv$steepness
midpoint = argv$midpoint
target_event = argv$tet
b_length = argv$fet
acceptance_threshold = argv$pthres
export_spm = argv$spme
seed = argv$seed
csv = argv$csv
max_warning_interval = argv$maxwint
min_warning_interval = argv$minwint

CROSS = argv$cross

training_set = read_dataset(argv$train)
test_set =  read_dataset(argv$test)

episodes_list <- preprocess(training_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,FALSE)

TEST_DATA = TRUE
REMOVE_RARE_EVENTS = FALSE
REMOVE_FREQUENT_EVENTS = FALSE
KEEP_ONLY_FIRST_OCCURENESS = FALSE
MULTI_INSTANCE_LEARNING_TEXT = FALSE #MIL as explained in the text
MULTI_INSTANCE_LEARNING_IMAGE = FALSE #MIL as presented in the figure
FEATURE_SELECTION = FALSE
test_episodes_list <- preprocess(test_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,FALSE)


#episodes_list = episodes_list[-which(lapply(episodes_list, nrow)<midpoint)]
#test_episodes_list = test_episodes_list[-which(lapply(test_episodes_list, nrow)<midpoint)]

FEATURE_SELECTION = argv$fs
if(!csv){
  print("~~~~~~~CROSS~~~~~~~")
}
flds = list()
k = argv$cross
ep_length = length(episodes_list)
for (r in (1:k)) {
  for (p in (1:2)){
    cross_name = r
    if(p == 1){
      flds = list(1:round(r*10*ep_length/100))
    } else {
      cross_name = r+k
      flds = list(ceiling(((100-(r*10))*ep_length)/100):ep_length)
    }
    train = episodes_list[-flds[[1]]]
    test = test_episodes_list[flds[[1]]]

    #merge episodes
    merged_episodes = ldply(train, data.frame)
    merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))]
    #remove columns with all values equal to zero
    merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0]

    if(FEATURE_SELECTION){
      merged_episodes = feature_selection(merged_episodes,top_features)
    }

    my.rf = eval(merged_episodes,test,seed,cross_name)
  }
}

# flds = list()
# k = argv$cross
# for (r in (1:k)) {
#   ep_length = length(episodes_list)
#   if (r == 1) {
#     flds[r] = list(1:(r*ep_length/k))
#   } else {
#     flds[r] = list(((r-1)*ep_length/k):(r*ep_length/k))
#   }
#
#   train = episodes_list[-flds[[r]]]
#   test = test_episodes_list[flds[[r]]]
#
#   #merge episodes
#   merged_episodes = ldply(train, data.frame)
#   merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))]
#   #remove columns with all values equal to zero
#   merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0]
#
#   if(FEATURE_SELECTION){
#     merged_episodes = feature_selection(merged_episodes,top_features)
#   }
#
#   my.rf = eval(merged_episodes,test,seed,r)
# }

# for(s in (0:6)){
#   my.rf = eval(merged_episodes,test_episodes_list,seed)
#   seed = seed + 1
# }

# for(ep in 1:length(test_episodes_list)){
#   jpeg(paste(ep,'_rplot.jpg'))
#   plot(test_episodes_list,ep,my.rf)
#   dev.off()
# }

if(export_spm){
  if(!csv){
    print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~")
  }
  spm_train_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$train)
  spm_test_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$test)
  spm_results_path = gsub(".csv",paste("_results_",id,".csv",sep=""),argv$train)
  confidence = argv$conf
  min_dist_seq = argv$minti
  max_dist_seq = argv$maxti
  min_dist_first_last = argv$minwi
  max_dist_first_last = argv$maxwi
  java_path = argv$java
  jspmf_path = argv$spmf
  python_path = argv$python
  cep_path = argv$cep
  max_warning_interval = argv$maxwint
  min_warning_interval = argv$minwint
  export_ds_for_spm(target_event,episodes_list,spm_train_path)
  export_ds_for_spm(target_event,test_episodes_list,spm_test_path)

  if (file.exists(spm_results_path)) {
    invisible(file.remove(spm_results_path))
  }

  javaOutput <- system(paste(java_path,"-jar",jspmf_path,"run HirateYamana",spm_train_path,spm_results_path,confidence,min_dist_seq,max_dist_seq,min_dist_first_last,max_dist_first_last), intern = TRUE)
  #print(javaOutput)

  pythonOutput <- system(paste(python_path,cep_path,spm_results_path,spm_test_path,target_event), intern = TRUE)
  #print(pythonOutput)
  true_positives = 0
  false_positives = 0
  false_negatives = 0
  total_failures = 0
  d = 0

  warnings = list()
  for(w in pythonOutput){
    d = as.integer(str_extract(w, "\\-*\\d+\\.*\\d*"))
    if(!grepl("Failure",w,fixed=TRUE)){
      warnings = c(warnings,d)
    } else {
      total_failures = total_failures + 1
      if(length(warnings) == 0){
        false_negatives = false_negatives + 1
      } else {
        if(length(warnings[warnings < d-max_warning_interval]) > 0){
          false_positives = false_positives + length(warnings[warnings < d-max_warning_interval])
        }
        if(length(warnings[warnings >= (d-max_warning_interval)]) > 0 & length(warnings[warnings <= (d-min_warning_interval)]) > 0){
          true_positives = true_positives + 1
        } else {
          false_negatives = false_negatives + 1
        }
      }
      warnings = list()
    }
  }

  precision = true_positives/(true_positives+false_positives)
  if((true_positives+false_positives) == 0){
    precision = 0
  }

  recall = true_positives/total_failures

  F1 = 2*((precision*recall)/(precision+recall))
  if((precision+recall) == 0){
    F1 = 0
  }

  if(!csv){
    cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\n"))
  } else {
    cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$conf,",",argv$minti,",",argv$maxti,",",argv$minwi,",",argv$maxwi,",",argv$minwint,",",argv$maxwint, "\n",sep=""))
  }
}