Blame view

supervised/Supervised_Predictor_v5.r 30.1 KB
f19350bf4   Thanasis Naskos   first commit
1
2
3
4
5
6
7
8
9
  suppressMessages(library(CORElearn))
  suppressMessages(library(dplyr))
  suppressMessages(library(plyr))
  suppressMessages(library(data.table))
  suppressMessages(library(randomForest))
  suppressMessages(library(ggplot2))
  suppressMessages(library(grid))
  suppressMessages(library(argparser))
  suppressMessages(library(stringr))
3a64edd55   Thanasis Naskos   fixed blank time ...
10
  suppressMessages(library(lubridate))
f19350bf4   Thanasis Naskos   first commit
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
  
  # Write the episodes to `output` in a sequence format, one line per episode:
  # "<k> i j ..." itemsets joined by " -1 " and terminated by "-1 -2", where k is
  # the segment (row) index and i, j are the positions of the columns equal to 1.
  #   target_event: id appended as the last itemset of every episode.
  #   episodes_list: list of data.frames (a "Timestamps" column is ignored).
  #   output: path of the file to (re)create.
  # Episodes without any column equal to 1 are not written. Returns NULL.
  export_ds_for_spm <- function(target_event, episodes_list, output) {
    if (file.exists(output)) {
      file.remove(output)
    }
    #output for HirateYamana
    # seq_along()/seq_len() instead of 1:n so an empty list or an empty episode
    # does not produce the c(1, 0) index sequence.
    for (ep_index in seq_along(episodes_list)) {
      episode_df <- episodes_list[[ep_index]]
      # drop = FALSE keeps a data.frame even when only one column is left
      ep <- episode_df[, !(names(episode_df) %in% c("Timestamps")), drop = FALSE]
      ep_list <- list()
      for (i in seq_len(nrow(ep))) {
        matches <- which(ep[i, ] %in% c(1))
        if (length(matches) == 0) {
          next
        }
        # skipped segments leave NULL holes, so the segment index is preserved
        ep_list[i] <- paste(matches, collapse = " ")
      }
      if (length(ep_list) == 0) {
        next
      }
      ep_list[length(ep_list) + 1] <- target_event
      episode <- ""
      for (ep_lli in seq_along(ep_list)) {
        if (length(ep_list[[ep_lli]]) > 0) {
          index <- paste0("<", ep_lli, ">")
          itemset <- paste(index, ep_list[[ep_lli]], sep = " ")
          if (episode == "") {
            episode <- itemset
          } else {
            episode <- paste(episode, itemset, sep = " -1 ")
          }
        }
      }
      write(paste(episode, "-1 -2"), file = output, append = TRUE)
    }
  }
  
  # Drop every event whose total count is below a fraction of the count of the
  # target event.
  #   ds: data.frame with an Event_id column.
  #   target_event_frequency_proportion_rare: multiplier applied to the target
  #     event count to obtain the rarity cut-off.
  # Reads the globals `csv` (the banner is printed only when it is FALSE) and
  # `target_event`. Returns ds without the rows of the rare events.
  remove_rare_events <- function(ds, target_event_frequency_proportion_rare) {
    if (!csv) {
      print("~~~~~~~APPLYING PREPROCESSING REMOVE RARE EVENTS~~~~~~~")
    }
    event_counts <- table(ds$Event_id)
    target_count <- event_counts[names(event_counts) == target_event][[1]]
    cutoff <- target_count * target_event_frequency_proportion_rare
    rare_ids <- as.integer(names(event_counts[event_counts < cutoff]))
    is_rare <- ds$Event_id %in% rare_ids
    return(ds[!is_rare, ])
  }
  
  # Drop the non-target events whose count exceeds a fraction of the count of
  # the most frequent non-target event.
  #   ds: data.frame with an Event_id column.
  #   max_event_frequency_proportion_frequent: multiplier applied to the largest
  #     non-target count to obtain the cut-off.
  # Reads the globals `csv` and `target_event`. Returns the filtered ds.
  remove_frequent_events <- function(ds, max_event_frequency_proportion_frequent) {
    if (!csv) {
      print("~~~~~~~APPLYING PREPROCESSING REMOVE FREQUENT EVENTS~~~~~~~")
    }
    non_target <- ds[!(ds$Event_id == target_event), ]
    event_counts <- table(non_target$Event_id)
    # the first element of the decreasingly sorted table is the largest count
    top_count <- sort(event_counts, decreasing = TRUE)[[1]]
    cutoff <- top_count * max_event_frequency_proportion_frequent
    frequent_ids <- as.integer(names(event_counts[event_counts > cutoff]))
    is_frequent <- ds$Event_id %in% frequent_ids
    return(ds[!is_frequent, ])
  }
  
  # Within every episode, keep only the first occurrence of a value that is set
  # in consecutive segments: when a column sums to 2 over rows i-1 and i, the
  # entry in row i is reset to 0.
  #   episodes_list: list of data.frames, one per episode.
  # Reads the global `csv`. Returns the updated list (an empty list is returned
  # unchanged).
  keep_only_first_occureness <- function(episodes_list) {
    if (!csv) {
      print("~~~~~~~APPLYING PREPROCESSING KEEP ONLY FIRST OCCURENESS~~~~~~~")
    }
    # seq_along() so that an empty list is a no-op instead of an index error
    for (ep_index in seq_along(episodes_list)) {
      ep <- episodes_list[[ep_index]]
      if (nrow(ep) < 2) {
        next
      }
      # Walk from the last segment back to the second one: row i-1 is still
      # unmodified when row i is compared against it.
      for (i in (nrow(ep):2)) {
        # binary vectors: a sum of 2 means both segments have a 1 in that column
        matches <- which((ep[i, ] + ep[i - 1, ]) %in% c(2))
        if (length(matches) > 0) {
          # clear the later of the two consecutive occurrences
          ep[i, matches] <- 0
        }
      }
      episodes_list[[ep_index]] <- ep
    }
    return(episodes_list)
  }
  
  # Multi-instance learning over-sampling, "text" variant.
  # Every segment of an episode is copied to the output. Segments whose last
  # column (index b_length+1) is >= F_thres are also collected in a window of
  # at most `milw` rows. When the window is full, or the episode ends with a
  # non-empty window, one extra row is appended: a column is 1 if its window
  # mean is > 0, and the first and last columns are taken from the current row.
  #   milw: window size in rows.
  #   F_thres: threshold on the last column for a row to enter the window.
  #   episodes_list: list of data.frames with b_length+1 columns.
  #   b_length: index of the last column minus one.
  # Reads the global `csv`. Returns the list with the expanded episodes; an
  # empty list is returned unchanged.
  mil_text <- function(milw, F_thres, episodes_list, b_length) {
    if (!csv) {
      print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
    }
    window_df <- data.frame(matrix(ncol = b_length + 1, nrow = 0))

    # seq_along() so that an empty list does not index element 1
    for (ep_index in seq_along(episodes_list)) {
      ep <- episodes_list[[ep_index]]
      if (nrow(ep) < 1) {
        next
      }
      new_ep <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
      i <- 1
      while (i <= nrow(ep)) {
        new_ep <- rbind(new_ep, ep[i, ])

        if (ep[i, ][b_length + 1] >= F_thres && nrow(window_df) < milw) {
          window_df <- rbind(window_df, ep[i, ])
        }
        if ((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0) {
          # any positive mean means the value occurred somewhere in the window
          window_mean <- colMeans(window_df)
          window_mean[window_mean > 0] <- 1
          mf <- data.frame(as.list(window_mean))
          mf[1] <- ep[i, ][1]
          mf[b_length + 1] <- ep[i, ][b_length + 1]
          colnames(mf) <- colnames(new_ep)
          new_ep <- rbind(new_ep, mf)
          if (nrow(window_df) > 1) {
            # step back so that part of the window is processed again
            i <- i - (nrow(window_df) - 2)
          }
          window_df <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
        }
        i <- i + 1
      }
      episodes_list[[ep_index]] <- new_ep
    }
    return(episodes_list)
  }
  
  # Multi-instance learning, "figure" variant.
  # The segments of an episode are replaced by window summaries: rows are
  # collected in a window of at most `milw` rows, and when the window is full
  # (or the episode ends) one row is emitted in which a column is 1 if its
  # window mean is > 0, with the first and last columns taken from the current
  # row. If the first row of the window has a last-column value >= F_thres, the
  # next window starts one row after the start of the current one.
  #   milw: window size in rows.
  #   F_thres: threshold that enables the overlapping windows.
  #   episodes_list: list of data.frames with b_length+1 columns.
  #   b_length: index of the last column minus one.
  # Reads the global `csv`. Returns the list with the summarised episodes; an
  # empty list is returned unchanged.
  mil_image <- function(milw, F_thres, episodes_list, b_length) {
    if (!csv) {
      print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
    }

    # seq_along() so that an empty list does not index element 1
    for (ep_index in seq_along(episodes_list)) {
      ep <- episodes_list[[ep_index]]
      if (nrow(ep) < 1) {
        next
      }
      new_ep <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
      # rows waiting to be summarised
      window_df <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
      i <- 1
      while (i <= nrow(ep)) {
        if (nrow(window_df) < milw) {
          window_df <- rbind(window_df, ep[i, ])
        }
        if ((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0) {
          window_mean <- colMeans(window_df)
          window_mean[window_mean > 0] <- 1
          mf <- data.frame(as.list(window_mean))
          mf[1] <- ep[i, ][1]
          mf[b_length + 1] <- ep[i, ][b_length + 1]
          new_ep <- rbind(new_ep, mf)
          if (window_df[1, ][b_length + 1] >= F_thres && nrow(window_df) > 1) {
            # rewind to the first row of the window; the increment below then
            # starts the next window on its second row
            i <- i - (nrow(window_df) - 1)
          }
          window_df <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
        }
        i <- i + 1
      }
      episodes_list[[ep_index]] <- new_ep
    }
    return(episodes_list)
  }
  
  # Risk function: a logistic curve over the segment position inside an episode.
  #   s: steepness of the curve.
  #   midpoint: distance from the end of the episode at which the value is 0.5.
  #   t: position of the segment.
  #   ep_length: number of segments in the episode.
  # Returns 1 / (1 + exp(s * (ep_length - midpoint - t))).
  compute_F <- function(s, midpoint, t, ep_length) {
    distance_to_midpoint <- ep_length - midpoint - t
    exponent <- s * distance_to_midpoint
    1 / (1 + exp(exponent))
  }
  
  #convert event vectors to binary vectors
  # Turn the aggregated segments of one episode into frequency vectors.
  #   aggr_episode_df: data.frame with a `Timeframe` column and an `x` column
  #     holding the event ids of each segment (NA entries are skipped).
  #   b_length: number of event-id columns (e_1 .. e_<b_length>).
  #   s, midpoint: parameters forwarded to compute_F().
  # Returns a data.frame with columns Timestamps, e_1..e_<b_length>, Risk_F and
  # one row per segment. Risk_F is compute_F() at position i-1, forced to 0
  # when midpoint >= number of segments or when the segment has no events.
  compute_frequency_vectors <- function(aggr_episode_df, b_length, s, midpoint) {
    col_names <- c(c("Timestamps"), c(paste("e_", c(1:b_length), sep = "")), c("Risk_F"))
    n_segments <- nrow(aggr_episode_df)
    # rows are collected in a preallocated list and bound once at the end
    segment_rows <- vector("list", n_segments)
    for (i in seq_len(n_segments)) {
      #init a vector with b_length 0s
      freq_vector <- as.vector(integer(b_length))
      seg <- aggr_episode_df[i, ]
      segment_events <- seg$x[[1]]
      #store, at the position of each event id, how often it occurs in the segment
      for (value in segment_events) {
        if (is.na(value)) {
          next
        }
        freq_vector[[value]] <- length(which(segment_events == value))
      }
      risk <- compute_F(s, midpoint, i - 1, n_segments)
      if (midpoint >= n_segments || sum(freq_vector) == 0) {
        risk <- 0
      }
      # explicit numeric conversion, so the row does not depend on how c()
      # combines a date-time with plain numbers
      date <- as.numeric(seg$Timeframe[[1]])
      segment_rows[[i]] <- data.frame(matrix(c(date, freq_vector, risk), nrow = 1, ncol = b_length + 2))
    }
    freq_aggr_episode_df <- rbind(
      data.frame(matrix(ncol = b_length + 2, nrow = 0)),
      do.call(rbind, segment_rows)
    )
    colnames(freq_aggr_episode_df) <- col_names
    return(freq_aggr_episode_df)
  }
  
  # Split the event log into episodes (the events between two consecutive
  # occurrences of the target event) and convert each episode into hourly
  # frequency vectors.
  #   ds: data.frame with Timestamps ("%Y-%m-%dT%H:%M:%OSZ") and Event_id columns.
  #   target_event: id of the event that delimits the episodes.
  #   b_length, s, midpoint: forwarded to compute_frequency_vectors().
  #   test_mode: when TRUE, a target event that directly follows another target
  #     event (after at least one episode) yields a one-row all-zero episode.
  # Reads the global `csv`. Returns a list of data.frames with columns
  # Timestamps, e_2..e_<b_length>, Risk_F (column e_1 is dropped).
  create_episodes_list <- function(ds, target_event, b_length, s, midpoint, test_mode) {
    if (!csv) {
      print("~~~~~~~CREATING FREQUENCY VECTORS AND BINARIZE THEM~~~~~~~")
    }
    ts_format <- "%Y-%m-%dT%H:%M:%OSZ"
    target_event_spotted <- FALSE
    #a list with data.frames for the episodes (each episode one data.frame)
    episodes_list <- list()
    #events collected for the episode that is currently open
    episode_df <- data.frame(Timestamps = as.POSIXct(character()), Event_id = integer())
    #iterate over every line of the original dataset
    for (i in seq_len(nrow(ds))) {
      meas <- ds[i, ]
      #episodes only start after the first target event
      if ((meas$Event_id == target_event)) {
        target_event_spotted <- TRUE
      }
      if (meas$Event_id != target_event && target_event_spotted) {
        #collect the events that lie between two target events
        episode_df <- rbind(episode_df, data.frame(Timestamps = meas$Timestamps, Event_id = meas$Event_id))
      } else if (meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) != 0) {
        #the next target event closes the episode; its timestamp is added with
        #an NA event id so that the hourly grid reaches up to it
        episode_df <- rbind(episode_df, data.frame(Timestamps = meas$Timestamps, Event_id = NA))

        #every hour between the first and the last timestamp of the episode
        labels_seq <- seq(
          floor_date(as.POSIXct(head(episode_df, 1)$Timestamps, format = ts_format), 'hour'),
          floor_date(as.POSIXct(tail(episode_df, 1)$Timestamps, format = ts_format), 'hour'),
          by = "1 hour"
        )
        #group the event ids by hour. simplify = FALSE keeps one list element
        #per hour; with the default, equally sized groups are turned into a
        #matrix and only the first event of each hour would be read later.
        aggr_episode_df <- aggregate(
          episode_df[, 2],
          FUN = function(x) { return(x) },
          by = list(Timeframe = cut(as.POSIXct(episode_df$Timestamps, format = ts_format), "hour")),
          simplify = FALSE
        )

        aggr_episode_df[, 1] <- as.POSIXct(aggr_episode_df[, 1])
        full <- data.frame(Timeframe = labels_seq)
        full[, 1] <- as.POSIXct(full[, 1])
        #left join on the hourly grid: hours without events stay in the episode
        aggr_episode_df <- merge(full, aggr_episode_df, by = 'Timeframe', all.x = TRUE)

        #frequency vectors of the episode (binarized later in preprocess)
        bin_aggr_episode_df <- compute_frequency_vectors(aggr_episode_df, b_length, s, midpoint)

        #Remove event 0, which does not provide any info KOUGKA
        bin_aggr_episode_df <- bin_aggr_episode_df[, !(names(bin_aggr_episode_df) %in% c("e_1"))]

        #add the episode to the episodes_list
        episodes_list[[length(episodes_list) + 1]] <- bin_aggr_episode_df
        #reset episode_df to an empty data.frame
        episode_df <- data.frame(Timestamps = as.POSIXct(character()), Event_id = integer())
      } else if (meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) == 0 && length(episodes_list) > 0 && test_mode) {
        #target event right after another target event: emit a one-row episode
        #with Timestamps 0, all frequencies 0 and Risk_F 0
        freq_vector <- as.vector(integer(b_length))
        new_df <- data.frame(matrix(c(0, freq_vector, 0), nrow = 1, ncol = b_length + 2))
        episode_df <- rbind(episode_df, new_df)

        x <- c(c("Timestamps"), c(paste("e_", c(1:b_length), sep = "")), c("Risk_F"))
        colnames(episode_df) <- x
        episode_df <- episode_df[, !(names(episode_df) %in% c("e_1"))]

        #add the episode to the episodes_list
        episodes_list[[length(episodes_list) + 1]] <- episode_df
        #reset episode_df to an empty data.frame
        episode_df <- data.frame(Timestamps = as.POSIXct(character()), Event_id = integer())
      }
    }
    return(episodes_list)
  }
  
  # Run the preprocessing pipeline on an event log and return its episodes.
  # Steps, in order: optional removal of rare events, optional removal of
  # frequent events, episode creation, binarization of the event columns,
  # optional "keep only first occurrence", optional multi-instance learning
  # (the text variant takes precedence over the image variant).
  # TEST_DATA, FEATURE_SELECTION and top_features are accepted but not used
  # here. `w` is the multi-instance learning window size.
  # Returns the list of episode data.frames (empty when no episode was found).
  preprocess <- function(ds,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,w,F_thres,test_mode){

    #Remove events whose count is below a proportion of the target event count
    if (REMOVE_RARE_EVENTS) {
      ds <- remove_rare_events(ds, target_event_frequency_proportion_rare)
    }

    #Remove events whose count is above a proportion of the largest count
    if (REMOVE_FREQUENT_EVENTS) {
      ds <- remove_frequent_events(ds, max_event_frequency_proportion_frequent)
    }

    episodes_list <- create_episodes_list(ds, target_event, b_length, s, midpoint, test_mode)

    #binarize the vectors: every positive value between the first and the last
    #column becomes 1. seq_along() makes this a no-op for an empty list.
    for (ep_index in seq_along(episodes_list)) {
      ep <- episodes_list[[ep_index]]
      ep[2:(ncol(ep) - 1)][ep[2:(ncol(ep) - 1)] > 0] <- 1
      episodes_list[[ep_index]] <- ep
    }

    # keep only the first occurrence of an event in consecutive segments
    if (KEEP_ONLY_FIRST_OCCURENESS) {
      episodes_list <- keep_only_first_occureness(episodes_list)
    }

    # Multi-instance learning to increase the pattern frequency
    if (MULTI_INSTANCE_LEARNING_TEXT) {
      episodes_list <- mil_text(w, F_thres, episodes_list, b_length)
    } else if (MULTI_INSTANCE_LEARNING_IMAGE) {
      episodes_list <- mil_image(w, F_thres, episodes_list, b_length)
    }
    return(episodes_list)
  }
  
  # Keep the best-ranked features of the merged training episodes.
  # The columns are ranked with CORElearn::attrEval (estimator
  # "RReliefFexpRank", 50 iterations) against Risk_F.
  #   merged_episodes: data.frame of features plus Risk_F. The positional
  #     selection below assumes that Risk_F is the last column.
  #   top_features: maximum number of features to keep.
  # Reads the global `csv`. Returns the data.frame reduced to the selected
  # features followed by its last column.
  feature_selection <- function(merged_episodes, top_features) {
    if (!csv) {
      print("~~~~~~~APPLYING FEATURE SELECTION~~~~~~~")
    }
    estReliefF <- attrEval(Risk_F ~ ., merged_episodes, estimator = "RReliefFexpRank", ReliefIterations = 50)
    sorted_indeces <- order(estReliefF, decreasing = TRUE)
    # seq_len() selects nothing when top_features is 0 (1:0 would pick one)
    kept_indeces <- sorted_indeces[seq_len(min(top_features, length(sorted_indeces)))]
    merged_episodes <- merged_episodes %>% select(kept_indeces[seq_along(kept_indeces)], ncol(merged_episodes))
    return(merged_episodes)
  }
  
  # Read a comma-separated event log with a header line; lines starting with
  # "#" are treated as comments. The second column is converted to numeric.
  #   path: file to read.
  # Returns the data.frame.
  read_dataset <- function(path) {
    events <- read.table(
      file = path,
      header = TRUE,
      sep = ",",
      dec = ".",
      comment.char = "#"
    )
    events[[2]] <- as.numeric(events[[2]])
    events
  }
  
  # Read an event log and split it randomly, by calendar day, into a training
  # part (two thirds of the days, drawn with sample_n) and a testing part (the
  # remaining days).
  #   path: comma-separated file with Timestamps and Event_id columns.
  # Returns list(training = <data.frame>, testing = <data.frame>), each with
  # columns Timestamps and Event_id.
  read_dataset_cross <- function(path) {
    dataset <- read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
    dataset[, 2] <- as.numeric(dataset[, 2])

    # one row per day; the remaining columns hold the values of that day
    aggr_episode_df <- aggregate(dataset, FUN = function(x) { return(x) }, by = list(cut(as.POSIXct(dataset$Timestamps, format = "%Y-%m-%dT%H:%M:%OSZ"), "day"))) #%Y-%m-%dT%H:%M:%OSZ

    # Expand the per-day groups back into one row per event. The loop starts
    # at the second group through seq_len()[-1], so a partition with a single
    # day is handled as well (2:1 would visit rows 2 and 1).
    flatten_days <- function(groups) {
      flat <- data.frame(Timestamps = unlist(groups[, 1][1][[1]]), Event_id = unlist(groups[, 2][1][[1]]))
      for (i in seq_len(nrow(groups))[-1]) {
        flat <- rbind(flat, data.frame(Timestamps = unlist(groups[, 1][i][[1]]), Event_id = unlist(groups[, 2][i][[1]])))
      }
      flat
    }

    training <- sample_n(aggr_episode_df, (2 * nrow(aggr_episode_df) / 3))

    # anti-join on the day label: the days that were not drawn for training
    testing <- setDT(aggr_episode_df)[!training, on = "Group.1"]

    testing <- testing[, !"Group.1"]
    testing_df <- flatten_days(testing)

    training <- as.data.table(training)[, !"Group.1"]
    training_df <- flatten_days(training)

    return(list("training" = training_df, "testing" = testing_df))
  }
  
  # Read an event log and split its rows, in file order, into three consecutive
  # thirds; one third is used for testing and the other two for training.
  #   path: comma-separated file; the second column is converted to numeric.
  #   form: 1 -> test on the last third, 2 -> test on the first third,
  #         anything else -> test on the middle third.
  # The cut points are floor(n/3) and floor(2n/3). Every row ends up in exactly
  # one of the two parts (fractional or shared boundaries would put a row in
  # both parts or drop the last row).
  # Returns list(training = <data.frame>, testing = <data.frame>).
  read_dataset_cross2 <- function(path, form) {
    dataset <- read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
    dataset[, 2] <- as.numeric(dataset[, 2])

    n_rows <- nrow(dataset)
    first_cut <- floor(n_rows / 3)
    second_cut <- floor(2 * n_rows / 3)
    # seq_len() yields an empty index when a third has no rows
    head_rows <- seq_len(first_cut)
    mid_rows <- first_cut + seq_len(second_cut - first_cut)
    tail_rows <- second_cut + seq_len(n_rows - second_cut)

    if (form == 1) {
      training_rows <- c(head_rows, mid_rows)
      testing_rows <- tail_rows
    } else if (form == 2) {
      training_rows <- c(mid_rows, tail_rows)
      testing_rows <- head_rows
    } else {
      training_rows <- c(head_rows, tail_rows)
      testing_rows <- mid_rows
    }
    training_df <- dataset[training_rows, ]
    testing_df <- dataset[testing_rows, ]
    return(list("training" = training_df, "testing" = testing_df))
  }
  
  # Collect the indices of the "fake" episodes that directly follow ep_index.
  # An episode counts as fake when it has exactly one row and its Timestamps
  # value is 0. The scan stops at the first episode that is not fake.
  #   ep_index: index of the current episode.
  #   test_episodes_list: list of episode data.frames.
  # Returns the vector of indices (an empty vector when there is none).
  find_next_fake_episodes <- function(ep_index, test_episodes_list) {
    fake_ids <- vector()
    n_episodes <- length(test_episodes_list)
    if (ep_index >= n_episodes) {
      return(fake_ids)
    }
    for (idx in ((ep_index + 1):n_episodes)) {
      candidate <- test_episodes_list[[idx]]
      is_fake <- nrow(candidate) == 1 && candidate$Timestamps == 0
      if (!is_fake) {
        break
      }
      fake_ids <- c(fake_ids, idx)
    }
    fake_ids
  }
  
  # Collect the indices of the following episodes that fit completely into the
  # warning interval that is left after a prediction.
  #   ep_index: index of the episode in which the prediction was made.
  #   test_episodes_list: list of episode data.frames (one row per segment).
  #   pred_index: segment index of the prediction inside that episode.
  # Reads the global `max_warning_interval`. The remaining budget is
  # max_warning_interval - (episode length - pred_index). One-row episodes with
  # Timestamps 0 are skipped without using budget. The scan stops at the first
  # episode that is longer than the remaining budget.
  # Returns the vector of indices (an empty vector when there is none).
  find_next_close_episodes <- function(ep_index, test_episodes_list, pred_index) {
    ep_length <- nrow(test_episodes_list[ep_index][[1]])
    remaining_hours <- max_warning_interval - (ep_length - pred_index)

    nc_episodes_vec <- vector()
    while (remaining_hours > 0 && ep_index < length(test_episodes_list)) {
      ep_index <- ep_index + 1
      n_ep <- test_episodes_list[[ep_index]]
      if (nrow(n_ep) == 1 && n_ep$Timestamps == 0) {
        next
      }
      n_ep_length <- nrow(n_ep)
      if (n_ep_length > remaining_hours) {
        return(nc_episodes_vec)
      }
      # the episode fits, so the budget cannot become negative here
      remaining_hours <- remaining_hours - n_ep_length
      nc_episodes_vec <- c(nc_episodes_vec, ep_index)
    }
    return(nc_episodes_vec)
  }
3a64edd55   Thanasis Naskos   fixed blank time ...
412
  # Train a random forest on the merged training episodes, score it on the test
  # episodes and print the result.
  #   train_episodes: data.frame with the features and the Risk_F response.
  #   test_episodes_list: list of test episode data.frames.
  #   seed: seed set before the forest is trained.
  #   r: fold identifier, only written to the csv output.
  # Reads the globals acceptance_threshold, max_warning_interval,
  # min_warning_interval, csv and argv.
  # A prediction >= acceptance_threshold counts as a warning. Warnings earlier
  # than max_warning_interval segments before the end of the episode are false
  # positives; an episode with a warning between max_warning_interval and
  # min_warning_interval segments before its end is a true positive, otherwise
  # a false negative. The fake episodes that follow share the verdict of the
  # current episode, and the episodes covered by the last warning (see
  # find_next_close_episodes) count as true positives.
  # Returns the fitted randomForest model.
  # NOTE(review): at top level this name masks base::eval for code that is
  # evaluated in the global environment.
  eval <- function(train_episodes,test_episodes_list,seed,r){
    set.seed(seed)
    my.rf = randomForest(Risk_F ~ .,data=train_episodes,importance=TRUE)
    #varImpPlot(my.rf)
    false_positives = 0
    tp_vec = vector()
    fp_vec = vector()
    fn_vec = vector()
    true_positives = 0
    false_negatives = 0
    ep_index = 1
    while (ep_index <= length(test_episodes_list)) {
      ep = test_episodes_list[[ep_index]]
      fake_episodes_vec = find_next_fake_episodes(ep_index,test_episodes_list)
      fake_ep_cnt = length(fake_episodes_vec)

      ep = ep[ , !(names(ep) %in% c("Timestamps"))]
      Prediction <- predict(my.rf, ep)
      ep_length = length(Prediction)
      # the names of the predictions are the row names of the episode; they are
      # used as segment indices
      pred_indeces = as.numeric(names(Prediction[Prediction >= acceptance_threshold]))
      predicted_next_episodes = 0
      # reset for every episode so that no value of an earlier iteration is kept
      nc_episodes_vec = vector()
      if(length(pred_indeces) > 0){
        pred_index = tail(pred_indeces, n=1)
        nc_episodes_vec = find_next_close_episodes(ep_index,test_episodes_list,pred_index)
        predicted_next_episodes = length(nc_episodes_vec)
      }

      # warnings raised before the warning window starts
      early_warnings = pred_indeces[pred_indeces < (ep_length-(max_warning_interval))]
      if(length(early_warnings) > 0){
        fp_reps = length(early_warnings)
        false_positives = false_positives + fp_reps + (fake_ep_cnt*fp_reps)
        fp_vec = c(fp_vec,rep(ep_index,fp_reps),rep(fake_episodes_vec,fp_reps))
      }
      # warnings raised inside the warning window
      timely_warnings = pred_indeces[pred_indeces >= (ep_length-(max_warning_interval)) & pred_indeces <= (ep_length-min_warning_interval)]
      if(length(timely_warnings) > 0){
        true_positives = true_positives + 1 + fake_ep_cnt + predicted_next_episodes
        tp_vec = c(tp_vec,ep_index,fake_episodes_vec,nc_episodes_vec)
      } else {
        if(predicted_next_episodes > 0){
          true_positives = true_positives + predicted_next_episodes
          tp_vec = c(tp_vec,nc_episodes_vec)
        }
        false_negatives = false_negatives + 1 + fake_ep_cnt
        fn_vec = c(fn_vec,ep_index,fake_episodes_vec)
      }
      # skip the episodes that were already judged together with this one
      ep_index = ep_index + 1 + fake_ep_cnt + predicted_next_episodes
    }

    precision = 0
    if((true_positives+false_positives) != 0){
      precision = true_positives/(true_positives+false_positives)
    }
    # an empty test list would give 0/0 = NaN and break the comparison below
    recall = 0
    if(length(test_episodes_list) > 0){
      recall = true_positives/length(test_episodes_list)
    }

    F1 = 0
    if((precision+recall) != 0){
      F1 = 2*((precision*recall)/(precision+recall))
    }
    if(!csv){
      cat(paste("dataset:",argv$test,"
  true_positives:", true_positives,"
  false_positives:", false_positives,"
  false_negatives:", false_negatives,"
  precision:", precision,"
  recall:", recall,"
  F1:", F1, "
  tp:",paste(as.character(tp_vec), sep="", collapse=","), "
  fp:",paste(as.character(fp_vec), sep="", collapse=","), "
  fn:",paste(as.character(fn_vec), sep="", collapse=",")))
    } else {
      cat(paste(argv$test,",",true_positives,",",false_positives,",",false_negatives,",",precision,",",recall,",",F1,",",argv$fet,",",argv$tet,",",argv$rre,",",argv$rfe,",",argv$kofe,",",argv$mili,",",argv$milt,",",argv$fs,",",argv$top,",",argv$rer,",",argv$fer,",",argv$seed,",",argv$steepness,",",argv$pthres,",",argv$milw,",",argv$milthres,",",argv$midpoint,",",argv$minwint,",",argv$maxwint,",",r,",tp,",paste(as.character(tp_vec), sep="", collapse=","),",fp,",paste(as.character(fp_vec), sep="", collapse=","),",fn,",paste(as.character(fn_vec), sep="", collapse=","),"
  ",sep=""))
    }
    return(my.rf)
  }
  
  # Draw the actual and the predicted risk of one test episode, with the mean
  # squared error of the prediction shown as a label.
  #   test_episodes_list: list of episode data.frames.
  #   episode_index: index of the episode to draw.
  #   my.rf: fitted model that is passed to predict().
  plot <- function(test_episodes_list, episode_index, my.rf){
    selected_episode <- test_episodes_list[[episode_index]]
    feature_columns <- !(names(selected_episode) %in% c("Timestamps"))
    test_episodes <- selected_episode[ , feature_columns]

    Prediction <- predict(my.rf, test_episodes)
    actual_risk <- test_episodes$Risk_F
    results <- data.frame(Risk_F=actual_risk,num_Prediction=as.numeric(Prediction))
    mse <- mean((Prediction-actual_risk)^2)
    mse_label <- paste("MSE=",round(mse,3))

    # one line for the actual risk and one for the predicted risk
    chart <- ggplot(results,aes((1:nrow(results))))
    chart <- chart + geom_line(aes(y = Risk_F, colour = "Actual"))
    chart <- chart + geom_line(aes(y = num_Prediction, colour="Predicted"))
    chart <- chart + labs(colour="Lines") + xlab("Segments") + ylab('Risk (F)')
    chart <- chart + ggtitle("Risk Prediction")
    chart <- chart + theme(plot.title = element_text(hjust = 0.5))
    chart <- chart + geom_text(aes(label = mse_label, x = 20, y = 1), hjust = -2, vjust = 6, color="black", size=4)

    # Disable clip-area so that the MSE is shown in the plot
    gt <- ggplot_gtable(ggplot_build(chart))
    gt$layout$clip[gt$layout$name == "panel"] <- "off"
    grid.draw(gt)
  }
  
  
  # Command line interface of the script (argparser package).
  p <- arg_parser("Implementation of the AIRBUS Predictor")
  # Positional arguments
  p <- add_argument(p, "id", help="experiment ID")
  p <- add_argument(p, "train", help="training dataset")
  p <- add_argument(p, "test", help="test dataset")
  p <- add_argument(p, "fet", help="different types of the fault events",default=151)
  p <- add_argument(p, "tet", help="type of the target fault events",default=151)
  # Preprocessing switches
  p <- add_argument(p, "--rre", help="remove rare events", default=FALSE)
  p <- add_argument(p, "--rfe", help="remove frequent events", default=FALSE)
  p <- add_argument(p, "--kofe", help="keep only first event", default=FALSE)
  p <- add_argument(p, "--milt", help="MIL as written in the text of the paper", default=TRUE)
  p <- add_argument(p, "--mili", help="MIL as shonw in the Figure of the paper", default=FALSE)
  p <- add_argument(p, "--milthres", help="MIL threshold to the sigmoid function for over-sampling", default=0.3)
  p <- add_argument(p, "--steepness", help="steepness of the sigmoid function", default=0.8)
  p <- add_argument(p, "--midpoint", help="midpoint of the sigmoid function (in days)", default=4)
  p <- add_argument(p, "--fs", help="apply feature selection", default=TRUE)
  p <- add_argument(p, "--top", help="# of features to keep in feature selection", default=50)
  p <- add_argument(p, "--rer", help="rare events ratio of the target event frequency", default=0.2)
  p <- add_argument(p, "--fer", help="frequent events ratio of the frequency of the most frequent event", default=0.8)
  p <- add_argument(p, "--milw", help="MIL window size (in days)", default=2)
  p <- add_argument(p, "--pthres", help="prediction threshold to the Risk value for a true positive episode", default=0.35)
  p <- add_argument(p, "--seed", help="seed for RF", default=500)
  p <- add_argument(p, "--csv", help="output for csv", default=FALSE)


  # Sequential pattern mining export and external tools.
  # The help texts of --python and --cep used to repeat "the java path".
  p <- add_argument(p, "--spme", help="export datasets for sequential pattern minning", default=FALSE)
  p <- add_argument(p, "--java", help="the java path", default="/usr/bin/java")
  p <- add_argument(p, "--python", help="the python path", default="/usr/bin/python")
  p <- add_argument(p, "--cep", help="the path of the spm_rules.py script", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spm_rules.py")
  p <- add_argument(p, "--spmf", help="the spmf path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spmf.jar")
  p <- add_argument(p, "--conf", help="minimum support (minsup)", default="20%")
  p <- add_argument(p, "--minti", help="minimum time interval allowed between two succesive itemsets of a sequential pattern", default=1)
  p <- add_argument(p, "--maxti", help="maximum time interval allowed between two succesive itemsets of a sequential pattern", default=5)
  p <- add_argument(p, "--minwi", help="minimum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=1)
  p <- add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11)
  # Evaluation
  p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1)
  p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=8)
  p <- add_argument(p, "--cross", help="cross validation", default=5)
  p <- add_argument(p, "--form", help="form", default=1)
  
  
  
  # Parse the command line. Without command line arguments the script falls
  # back to a hard-coded development data set, used for both training and
  # testing.
  if( length(commandArgs(trailingOnly = TRUE)) != 0){
    argv <- parse_args(p)
  } else {
    argv <- parse_args(p,c(1,"/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch2_6396te_training.csv","/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch2_6396te_training.csv",6396,6396))
  }


  # Script-wide settings. Several functions above read these as globals
  # (csv, target_event, acceptance_threshold, max_warning_interval,
  # min_warning_interval).
  TEST_DATA = FALSE
  id = argv$id
  REMOVE_RARE_EVENTS = argv$rre
  REMOVE_FREQUENT_EVENTS = argv$rfe
  KEEP_ONLY_FIRST_OCCURENESS = argv$kofe
  MULTI_INSTANCE_LEARNING_TEXT = argv$milt #MIL as explained in the text
  MULTI_INSTANCE_LEARNING_IMAGE = argv$mili #MIL as presented in the figure
  FEATURE_SELECTION = argv$fs
  top_features = argv$top
  target_event_frequency_proportion_rare = argv$rer
  max_event_frequency_proportion_frequent = argv$fer
  milw = argv$milw
  F_thres = argv$milthres
  s = argv$steepness
  midpoint = argv$midpoint
  target_event = argv$tet
  b_length = argv$fet
  acceptance_threshold = argv$pthres
  export_spm = argv$spme
  seed = argv$seed
  csv = argv$csv
  max_warning_interval = argv$maxwint
  min_warning_interval = argv$minwint

  CROSS = argv$cross
3a64edd55   Thanasis Naskos   fixed blank time ...
587
588
  # Load both data sets and turn them into episode lists.
  training_set = read_dataset(argv$train)
  test_set =  read_dataset(argv$test)

  # The training data goes through the preprocessing steps that were selected
  # on the command line.
  episodes_list <- preprocess(training_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,FALSE)

  # The test data is only split into episodes and binarized: every optional
  # preprocessing step is switched off.
  # NOTE(review): test_mode is FALSE here as well, so no one-row fake episodes
  # are generated for the test set - confirm that this is intended.
  TEST_DATA = TRUE
  REMOVE_RARE_EVENTS = FALSE
  REMOVE_FREQUENT_EVENTS = FALSE
  KEEP_ONLY_FIRST_OCCURENESS = FALSE
  MULTI_INSTANCE_LEARNING_TEXT = FALSE #MIL as explained in the text
  MULTI_INSTANCE_LEARNING_IMAGE = FALSE #MIL as presented in the figure
  FEATURE_SELECTION = FALSE
  test_episodes_list <- preprocess(test_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,FALSE)
  
  
  #episodes_list = episodes_list[-which(lapply(episodes_list, nrow)<midpoint)]
  #test_episodes_list = test_episodes_list[-which(lapply(test_episodes_list, nrow)<midpoint)]
  
  # ---- Cross evaluation ----
  # For every r in 1..k two folds are evaluated:
  #   p == 1: the FIRST r*10 percent of the episodes form the held-out fold,
  #   p == 2: the LAST  r*10 percent of the episodes form the held-out fold.
  # The held-out indices are removed from episodes_list to build the training
  # data and are used to pick the test episodes from test_episodes_list.
  # NOTE(review): the indices are derived from length(episodes_list) but are
  # also applied to test_episodes_list - this presumes both lists are aligned
  # (same episodes in the same order); confirm against the callers.
  # NOTE(review): the fold size is hard-coded as r*10 percent, so k > 10 yields
  # indices outside the list - confirm the supported range of --cross.

  # Restore the user-requested feature-selection setting.
  FEATURE_SELECTION <- argv$fs
  if(!csv){
    print("~~~~~~~CROSS~~~~~~~")
  }
  flds <- list()
  k <- argv$cross
  ep_length <- length(episodes_list)
  # seq_len() instead of 1:k so that k == 0 runs no fold at all
  # (1:0 would iterate over c(1, 0)).
  for (r in seq_len(k)) {
    for (p in (1:2)){
      # Fold label handed to eval(): r for the "first" folds, r+k for the "last".
      cross_name <- r
      if(p == 1){
        flds <- list(1:round(r*10*ep_length/100))
      } else {
        cross_name <- r+k
        flds <- list(ceiling(((100-(r*10))*ep_length)/100):ep_length)
      }
      train <- episodes_list[-flds[[1]]]
      test <- test_episodes_list[flds[[1]]]

      #merge episodes
      merged_episodes <- ldply(train, data.frame)
      merged_episodes <- merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps")), drop = FALSE]
      #remove columns with all values equal to zero
      # drop = FALSE keeps a data frame even if only one column survives.
      merged_episodes <- merged_episodes[, colSums(merged_episodes != 0) > 0, drop = FALSE]

      if(FEATURE_SELECTION){
        merged_episodes <- feature_selection(merged_episodes,top_features)
      }

      # eval() here is the evaluation routine defined in this file.
      my.rf <- eval(merged_episodes,test,seed,cross_name)
    }
  }
3a64edd55   Thanasis Naskos   fixed blank time ...
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
  # flds = list()
  # k = argv$cross
  # for (r in (1:k)) {
  #   ep_length = length(episodes_list)
  #   if (r == 1) {
  #     flds[r] = list(1:(r*ep_length/k))
  #   } else {
  #     flds[r] = list(((r-1)*ep_length/k):(r*ep_length/k))
  #   }
  #   
  #   train = episodes_list[-flds[[r]]]
  #   test = test_episodes_list[flds[[r]]]
  #   
  #   #merge episodes
  #   merged_episodes = ldply(train, data.frame)
  #   merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))]
  #   #remove columns with all values equal to zero
  #   merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0]
  #   
  #   if(FEATURE_SELECTION){
  #     merged_episodes = feature_selection(merged_episodes,top_features)
  #   }
  #   
  #   my.rf = eval(merged_episodes,test,seed,r)
  # }
f19350bf4   Thanasis Naskos   first commit
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
  
  # for(s in (0:6)){
  #   my.rf = eval(merged_episodes,test_episodes_list,seed)
  #   seed = seed + 1
  # }
  
  # for(ep in 1:length(test_episodes_list)){
  #   jpeg(paste(ep,'_rplot.jpg'))
  #   plot(test_episodes_list,ep,my.rf)
  #   dev.off()
  # }
  
  # ---- Sequential pattern mining (SPM) export and scoring ----
  # When export_spm is set:
  #   1. write the training and test episodes with export_ds_for_spm(),
  #   2. run the external SPMF jar ("HirateYamana") on the training export,
  #   3. run the external Python script (cep_path) on the mined results and the
  #      test export,
  #   4. score the lines printed by that script and report
  #      precision / recall / F1 (human readable, or one CSV row when csv is set).
  if(export_spm){
    if(!csv){
      print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~")
    }
    # Derive the output paths from the input paths. fixed = TRUE matches the
    # literal ".csv"; as a regex the dot would match any character and rewrite
    # unrelated parts of the path (e.g. "/csvs/").
    spm_train_path <- gsub(".csv",paste0("_spm_",id,".csv"),argv$train,fixed=TRUE)
    spm_test_path <- gsub(".csv",paste0("_spm_",id,".csv"),argv$test,fixed=TRUE)
    spm_results_path <- gsub(".csv",paste0("_results_",id,".csv"),argv$train,fixed=TRUE)
    confidence <- argv$conf
    min_dist_seq <- argv$minti
    max_dist_seq <- argv$maxti
    min_dist_first_last <- argv$minwi
    max_dist_first_last <- argv$maxwi
    java_path <- argv$java
    jspmf_path <- argv$spmf
    python_path <- argv$python
    cep_path <- argv$cep
    max_warning_interval <- argv$maxwint
    min_warning_interval <- argv$minwint
    export_ds_for_spm(target_event,episodes_list,spm_train_path)
    export_ds_for_spm(target_event,test_episodes_list,spm_test_path)

    # Remove a stale results file before the miner runs.
    if (file.exists(spm_results_path)) {
      invisible(file.remove(spm_results_path))
    }

    javaOutput <- system(paste(java_path,"-jar",jspmf_path,"run HirateYamana",spm_train_path,spm_results_path,confidence,min_dist_seq,max_dist_seq,min_dist_first_last,max_dist_first_last), intern = TRUE)
    #print(javaOutput)

    pythonOutput <- system(paste(python_path,cep_path,spm_results_path,spm_test_path,target_event), intern = TRUE)
    #print(pythonOutput)
    true_positives <- 0
    false_positives <- 0
    false_negatives <- 0
    total_failures <- 0
    d <- 0

    # Time stamps of the warnings seen since the previous failure.
    warnings <- integer(0)
    for(w in pythonOutput){
      # First number found in the line, used as its time stamp
      # (NA when the line contains no number).
      d <- as.integer(str_extract(w, "\\-*\\d+\\.*\\d*"))
      if(!grepl("Failure",w,fixed=TRUE)){
        # Every line that does not mention "Failure" is treated as a warning.
        warnings <- c(warnings,d)
      } else {
        total_failures <- total_failures + 1
        # Warnings raised earlier than max_warning_interval before the failure
        # count as false positives. na.rm = TRUE ignores unparsable time stamps.
        too_early <- warnings < (d-max_warning_interval)
        false_positives <- false_positives + sum(too_early, na.rm = TRUE)
        # The failure is predicted only if a SINGLE warning lies inside
        # [d - max_warning_interval, d - min_warning_interval]. Testing the two
        # bounds on separate warnings would accept a too-early plus a too-late
        # warning as a hit.
        in_window <- warnings >= (d-max_warning_interval) & warnings <= (d-min_warning_interval)
        if(any(in_window, na.rm = TRUE)){
          true_positives <- true_positives + 1
        } else {
          # Also covers the case of no warning at all before this failure.
          false_negatives <- false_negatives + 1
        }
        # Start collecting warnings for the next failure.
        warnings <- integer(0)
      }
    }

    # Guard every ratio against a zero denominator; 0/0 would give NaN and make
    # the later comparison fail with "missing value where TRUE/FALSE needed".
    if((true_positives+false_positives) == 0){
      precision <- 0
    } else {
      precision <- true_positives/(true_positives+false_positives)
    }

    if(total_failures == 0){
      recall <- 0
    } else {
      recall <- true_positives/total_failures
    }

    if((precision+recall) == 0){
      F1 <- 0
    } else {
      F1 <- 2*((precision*recall)/(precision+recall))
    }

    if(!csv){
      cat(paste("dataset:",argv$test,"
  true_positives:", true_positives,"
  false_positives:", false_positives,"
  false_negatives:", false_negatives,"
  precision:", precision,"
  recall:", recall,"
  F1:", F1, "
  "))
    } else {
      cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$conf,",",argv$minti,",",argv$maxti,",",argv$minwi,",",argv$maxwi,",",argv$minwint,",",argv$maxwint, "
  ",sep=""))
    }
  }