Blame view

AIRBUS_Predictor_v3.r 22.2 KB
4eb80aaaa   Thanasis Naskos   init commit
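  # AIRBUS_Predictor_v3.r
  # Trains and evaluates a random-forest risk predictor over event-log episodes:
  # the log is split into episodes between occurrences of a target fault event,
  # events are aggregated per day into (binarized) frequency vectors, optional
  # preprocessing is applied (rare/frequent-event removal, first-occurrence
  # filtering, multi-instance learning), and predictions are scored with
  # precision/recall/F1 under a warning-interval policy. Episodes can also be
  # exported for sequential pattern mining with SPMF.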
  suppressMessages(library(CORElearn))
  suppressMessages(library(dplyr))
  suppressMessages(library(plyr))
  suppressMessages(library(data.table))
  suppressMessages(library(randomForest))
  suppressMessages(library(ggplot2))
  suppressMessages(library(grid))
  suppressMessages(library(argparser))
  suppressMessages(library(stringr))
  
  export_ds_for_spm <- function(target_event,episodes_list,output){
    if (file.exists(output)) {
      file.remove(output)
    }
    #output in Hirate-Yamana (SPMF) sequence format
    for(ep_index in (1:length(episodes_list))){
      ep = episodes_list[[ep_index]][ , !(names(episodes_list[[ep_index]]) %in% c("Timestamps"))]
      ep_list = list()
      for(i in (1:nrow(ep))){
        matches = which(ep[i,] %in% c(1))
        if(length(matches) == 0){
          next
        }
        line=paste(matches,collapse=" ")
        ep_list[i] = line
      }
      if(length(ep_list) == 0){
        next
      }
      ep_list[length(ep_list)+1] = target_event
      episode = ""
      for(ep_lli in (1:length(ep_list))){
        if(length(ep_list[[ep_lli]]) > 0){
          index = paste(paste("<",ep_lli,sep=""),">",sep="")
          if(episode == ""){
            episode = paste(index,ep_list[[ep_lli]],sep=" ")
          } else {
            episode = paste(episode,paste(index,ep_list[[ep_lli]],sep=" "),sep=" -1 ")
          }
        }
      }
      write(paste(episode,"-1 -2"),file=output,append=TRUE)
    }
  }
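  # A sketch of the emitted Hirate-Yamana (SPMF) line format, with illustrative
  # event ids: a two-segment episode with events {3,7} then {5}, closed by
  # target event 151, is written as
  #   <1> 3 7 -1 <2> 5 -1 <3> 151 -1 -2
  # where <t> is the segment index, -1 ends an itemset and -2 ends the sequence.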
  
  remove_rare_events <- function(ds,target_event_frequency_proportion_rare){
    if(!csv){
      print("~~~~~~~APPLYING PROPREPROCESSING: REMOVE RARE EVENTS~~~~~~~")
    }
    a = table(ds$Event_id)
    target_event_frequency = a[names(a)==target_event]
    rare_events = as.integer(names(a[a < target_event_frequency*target_event_frequency_proportion_rare]))
    return(ds[!(ds$Event_id %in% rare_events),])
  }
  
  remove_frequent_events <- function(ds,max_event_frequency_proportion_frequent){
    if(!csv){
      print("~~~~~~~APPLYING PROPREPROCESSING: REMOVE FREQUENT EVENTS~~~~~~~")
    }
    a = table(ds$Event_id)
    max_freq = sort(a,decreasing = TRUE)[[1]]
    frequent_events = as.integer(names(a[a > max_freq*max_event_frequency_proportion_frequent]))
    return(ds[!(ds$Event_id %in% frequent_events),])
  }
  
  keep_only_first_occureness <- function(episodes_list){
    if(!csv){
      print("~~~~~~~APPLYING PROPREPROCESSING: KEEP ONLY FIRST OCCURENESS~~~~~~~")
    }
    #for every episode in the episodes_list
    for(ep_index in (1:length(episodes_list))){
      ep = episodes_list[[ep_index]]
      #For every segment of each episode, starting from the end up to the second segment.
      #We keep only the 1st occurrence of consecutive events, hence starting from the end is the easy way.
      for(i in (nrow(ep):2)){
        #as we deal with binary vectors: to find the indices where both vectors have a 1, we sum them and check for 2s in the result
        matches = which((ep[i,]+ep[i-1,]) %in% c(2))
        #replace the 1s with 0s in the matching positions of the segment that is closer to the end of the episode
        ep[i,][c(matches)] = 0
      }
      episodes_list[[ep_index]] = ep
    }
    return(episodes_list)
  }
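  # Illustration with a two-event alphabet: consecutive segments (1,1) and (1,0)
  # become (1,1) and (0,0), because event 1 in the second segment merely repeats
  # the first; scanning backwards keeps only the first segment of each run.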
  
  mil_text <- function(milw,F_thres,episodes_list,b_length){
    if(!csv){
      print("~~~~~~~APPLYING PROPREPROCESSING: MULTI INSTANCE LEARNING~~~~~~~")
    }
    window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
    
    #for every episode in the episodes_list
    for(ep_index in (1:length(episodes_list))){
      ep = episodes_list[[ep_index]]
      new_ep = data.frame(matrix(ncol = b_length+2, nrow = 0))
      i = 1
      while(i <= nrow(ep)){
        new_ep = rbind(new_ep,ep[i,])
        if(ep[i,][b_length+2] >= F_thres && nrow(window_df) < milw){
          window_df = rbind(window_df,ep[i,])
        }
        if(nrow(window_df) == milw || i == nrow(ep)){
          mean = colMeans(window_df)
          mean[mean > 0] = 1
          mf = data.frame(as.list(mean))
          mf[1] = ep[i,][1]
          mf[b_length+2] = ep[i,][b_length+2]
          #colnames(mf) = colnames(new_ep)
          new_ep = rbind(new_ep,mf)
          if(nrow(window_df) > 1){
            i = i - (nrow(window_df)-2)
          }
          window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
        }
        i = i + 1
      }
      episodes_list[[ep_index]] = new_ep
    }
    return(episodes_list)
  }
  
  mil_image <- function(milw,F_thres,episodes_list,b_length){
    if(!csv){
      print("~~~~~~~APPLYING PROPREPROCESSING: MULTI INSTANCE LEARNING~~~~~~~")
    }
  
    #for every episode in the episodes_list
    for(ep_index in (1:length(episodes_list))){
      ep = episodes_list[[ep_index]]
      new_ep = data.frame(matrix(ncol = b_length+2, nrow = 0))
      #a data.frame with the vectors that need to be averaged
      window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
      i = 1
      while(i <= nrow(ep)){
        #new_ep = rbind(new_ep,ep[i,])
        if(nrow(window_df) < milw){
          window_df = rbind(window_df,ep[i,])
        }
        if(nrow(window_df) == milw || i == nrow(ep)){
          mean = colMeans(window_df)
          mean[mean > 0] = 1
          mf = data.frame(as.list(mean))
          mf[1] = ep[i,][1]
          mf[b_length+2] = ep[i,][b_length+2]
          #colnames(mf) = colnames(new_ep)
          new_ep = rbind(new_ep,mf)
          if(window_df[1,][b_length+2] >= F_thres && nrow(window_df) > 1){
            i = i - (nrow(window_df)-1)
          }
          window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
        }
        i = i + 1
      }
      episodes_list[[ep_index]] = new_ep
    }
    return(episodes_list)
  }
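  # Note on the two MIL variants above: mil_text keeps every original row and
  # additionally appends an averaged, re-binarized row built from up to milw
  # rows whose Risk_F >= F_thres, rewinding the scan to over-sample that region;
  # mil_image instead replaces each milw-row window with its averaged row and
  # rewinds only when the window starts at Risk_F >= F_thres. In both, the
  # synthetic row inherits the Timestamp and Risk_F of the current row.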
  
  #the Risk function
  compute_F <- function(s,midpoint,t,ep_length){
    #s controls the steepness of the sigmoid; midpoint shifts its center towards the episode end
    return(1/(1+exp(s*(ep_length-midpoint-t))))
  }
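  # Worked example with the script defaults s = 0.7 and midpoint = 11: for an
  # episode of ep_length = 30 segments, F(t) = 1/(1+exp(0.7*(19-t))), so the
  # risk is ~0 at t = 0, reaches 0.5 at t = 19 (midpoint = 11 segments before
  # the episode closes) and climbs to ~0.999 at t = 29, the last segment.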
  
  #convert per-segment event lists to event-frequency vectors (binarized later in preprocess)
  compute_frequency_vectors <- function(aggr_episode_df,b_length,s,midpoint){
    freq_aggr_episode_df <- data.frame(matrix(ncol = b_length+2, nrow = 0))
    x <- c(c("Timestamps"), c(paste("e_",c(1:b_length),sep = "")), c("Risk_F"))
    # colnames(bin_aggr_episode_df) <- x
    
    for(i in 1:nrow(aggr_episode_df)) {
      #init a vector of b_length zeros
      freq_vector = as.vector(integer(b_length))
      seg <- aggr_episode_df[i,]
      #store the frequency of each event id that occurs in the segment
      for(value in seg$x[[1]]){
        freq_vector[[value]] = length(which(seg$x[[1]] == value))
      }
      #add a new row to the freq_aggr_episode_df;
      #a matrix holds the elements of the new data.frame, since a matrix can store variables of different data types
      F = compute_F(s,midpoint,i-1,nrow(aggr_episode_df))
      date = seg$Timeframe[[1]]
      new_df = data.frame(matrix(c(date, freq_vector,F),nrow=1,ncol=b_length+2))
      freq_aggr_episode_df <- rbind(freq_aggr_episode_df,new_df)
    }
    # x <- c(c("Timestamps"), c(paste("e_",c(1:3405))), c("Risk_F"))
    colnames(freq_aggr_episode_df) <- x
    return(freq_aggr_episode_df)
  }
  
  create_episodes_list <- function(ds,target_event,b_length,s,midpoint){
    if(!csv){
      print("~~~~~~~CREATING FREQUENCY VECTORS AND BINARIZE THEM~~~~~~~")
    }
    #divide the dataset into episodes
    target_event_spotted = FALSE
    #a list with data.frames for the episodes (each episode one data.frame)
    episodes_list = list()
    #data.frame for episodes
    episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
    #iterate over every line of the original dataset
    for(i in 1:nrow(ds)) {
      #get the current row of the ds
      meas <- ds[i,]
      #If it is the target event enable the appropriate flag
      if((meas$Event_id == target_event)){
        target_event_spotted = TRUE
      }
      #fill the episode data.frame with the events that are between two target events
      if(meas$Event_id != target_event && target_event_spotted){
        episode_df <- rbind(episode_df,data.frame(Timestamps=meas$Timestamps, Event_id=meas$Event_id))
      } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) != 0){
        #a second occurrence of the target event is spotted, close the episode
        #target_event_spotted = FALSE
        #aggregate by day all the events to form the segments inside the episodes
        aggr_episode_df = aggregate(episode_df[ ,2], FUN=function(x){return(x)}, by=list(Timeframe=cut(as.POSIXct(episode_df$Timestamps, format="%Y-%m-%d"),"day"))) #%Y-%m-%dT%H:%M:%OSZ
        #compute the frequency vector of each day-aggregated segment
        bin_aggr_episode_df = compute_frequency_vectors(aggr_episode_df,b_length,s,midpoint)
        
        #Remove event 0, which does not provide any info KOUGKA
        #bin_aggr_episode_df = bin_aggr_episode_df[ , !(names(bin_aggr_episode_df) %in% c("e_1"))]
        
        #add the episode to the episodes_list
        episodes_list[[length(episodes_list)+1]] = bin_aggr_episode_df
        #reset episode_df to an empty data.frame
        episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
      }
    }
    return(episodes_list)
  }
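  # Construction sketch (event ids illustrative): with target event 151, the
  # stream ..., 151, e3, e7 (day 1), e5 (day 2), 151, ... yields one episode of
  # two day-segments, {3,7} and {5}; events before the first occurrence of the
  # target event and the target events themselves are not part of any episode.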
  
  preprocess <- function(ds,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,w,F_thres){
    
    #Remove events that appear fewer than n times, with n = (target event frequency) * rer
    if(REMOVE_RARE_EVENTS){
      ds<-remove_rare_events(ds,target_event_frequency_proportion_rare)
    }
    
    #Remove events that appear more than n times, with n = (max event frequency) * fer
    if(REMOVE_FREQUENT_EVENTS){
      ds<-remove_frequent_events(ds,max_event_frequency_proportion_frequent)
    }
    
    episodes_list = create_episodes_list(ds,target_event,b_length,s,midpoint)
    
  #binarize the frequency vectors
    for(ep_index in (1:length(episodes_list))){
      ep = episodes_list[[ep_index]]
      ep[2:(ncol(ep)-1)][ep[2:(ncol(ep)-1)] > 0] = 1
      episodes_list[[ep_index]] = ep
    }
    
    # keep only the first occurrence of each event in consecutive segments
    if(KEEP_ONLY_FIRST_OCCURENESS){
      episodes_list <- keep_only_first_occureness(episodes_list)
    }
    
    # Multi-instance learning to increase the pattern frequency
    if(MULTI_INSTANCE_LEARNING_TEXT){
      episodes_list <- mil_text(w,F_thres,episodes_list,b_length)
    } else if(MULTI_INSTANCE_LEARNING_IMAGE){
      episodes_list <- mil_image(w,F_thres,episodes_list,b_length)
    }
    return(episodes_list)
  }
  
  feature_selection <- function(merged_episodes,top_features){
      estReliefF <- attrEval(Risk_F ~ ., merged_episodes, estimator="RReliefFexpRank", ReliefIterations=50)
      sorted_indeces = order(estReliefF, decreasing = TRUE)
      merged_episodes = merged_episodes %>% select(sorted_indeces[1:top_features],ncol(merged_episodes))
      return(merged_episodes)
  }
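  # RReliefF (CORElearn::attrEval, estimator "RReliefFexpRank") scores each event
  # column by how well it discriminates between nearby segments with different
  # Risk_F values; the top_features best-ranked columns are kept, plus Risk_F
  # itself via the trailing ncol(merged_episodes) in the select call.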
  
  read_dataset <- function(path){
    dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
    dataset[, 2]  <- as.numeric(dataset[, 2])
    return(dataset)
  }
  
  eval <- function(train_episodes,test_episodes_list,seed){
    set.seed(seed)
    my.rf = randomForest(Risk_F ~ .,data=train_episodes,importance=TRUE)
    #varImpPlot(my.rf)
    false_positives = 0
    true_positives = 0
    false_negatives = 0
    for(ep in test_episodes_list){
      ep = ep[ , !(names(ep) %in% c("Timestamps"))]
      Prediction <- predict(my.rf, ep)
      ep_length = length(Prediction)
      pred_indices = as.numeric(names(Prediction[Prediction >= acceptance_threshold]))
      if(length(pred_indices[pred_indices < (ep_length-(max_warning_interval))]) > 0){
        false_positives = false_positives + length(pred_indices[pred_indices < (ep_length-(max_warning_interval))])
      }
      if(length(pred_indices[pred_indices >= (ep_length-(max_warning_interval)) & pred_indices <= (ep_length-min_warning_interval)]) > 0){
        true_positives = true_positives + 1
      } else {
        false_negatives = false_negatives + 1
      }
    }
    
    precision = true_positives/(true_positives+false_positives)
    if((true_positives+false_positives) == 0){
      precision = 0
    }
    recall = true_positives/length(test_episodes_list)
    
    F1 = 2*((precision*recall)/(precision+recall))
    if((precision+recall) == 0){
      F1 = 0
    }
    if(!csv){
      cat(paste("dataset:",argv$test,"
  true_positives:", true_positives,"
  false_positives:", false_positives,"
  false_negatives:", false_negatives,"
  precision:", precision,"
  recall:", recall,"
  F1:", F1, "
  "))
    } else {
      cat(paste(argv$test,",",true_positives,",",false_positives,",",false_negatives,",",precision,",",recall,",",F1,",",argv$fet,",",argv$tet,",",argv$rre,",",argv$rfe,",",argv$kofe,",",argv$mili,",",argv$milt,",",argv$fs,",",argv$top,",",argv$rer,",",argv$fer,",",argv$seed,",",argv$steepness,",",argv$pthres,",",argv$milw,",",argv$milthres,",",argv$midpoint,",",argv$minwint,",",argv$maxwint,"
  ",sep=""))
    }
    return(my.rf)
  }
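  # Scoring sketch under the defaults pthres = 0.5, minwint = 1, maxwint = 22:
  # in a 100-segment test episode, any prediction >= 0.5 before segment 78
  # counts as a false positive; the episode is a true positive if at least one
  # prediction >= 0.5 falls in segments 78..99, and a false negative otherwise.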
  
  plot <- function(test_episodes_list, episode_index, my.rf){
    test_episodes = test_episodes_list[[episode_index]][ , !(names(test_episodes_list[[episode_index]]) %in% c("Timestamps"))]
    Prediction <- predict(my.rf, test_episodes)
    results = data.frame(Risk_F=test_episodes$Risk_F,num_Prediction=as.numeric(Prediction))
    mse = mean((Prediction-test_episodes$Risk_F)^2)
    
    chart =ggplot(results,aes((1:nrow(results)))) +
      # geom_rect(aes(xmin = ceiling(nrow(df_test)/2), xmax = nrow(df_test), ymin = -Inf, ymax = Inf),
      #           fill = "yellow", alpha = 0.003) +
      geom_line(aes(y = Risk_F, colour = "Actual")) +
      geom_line(aes(y = num_Prediction, colour="Predicted")) +
      labs(colour="Lines") +
      xlab("Segments") +
      ylab('Risk (F)') +
      ggtitle("Risk Prediction") + # (RR_KF_2YEARS_PAT08)
      theme(plot.title = element_text(hjust = 0.5)) +
      geom_text(aes(label = paste("MSE=",round(mse,3)), x = 20, y = 1), hjust = -2, vjust = 6, color="black", size=4) #add MSE label
    
    # Disable clip-area so that the MSE is shown in the plot
    gt <- ggplot_gtable(ggplot_build(chart))
    gt$layout$clip[gt$layout$name == "panel"] <- "off"
    grid.draw(gt)
  }
  
  
  p <- arg_parser("Implementation of the AIRBUS Predictor")
  # Add a positional argument
  p <- add_argument(p, "id", help="experiment ID")
  p <- add_argument(p, "train", help="training dataset")
  p <- add_argument(p, "test", help="test dataset")
  p <- add_argument(p, "fet", help="different types of the fault events",default=151)
  p <- add_argument(p, "tet", help="type of the target fault events",default=151)
  p <- add_argument(p, "--rre", help="remove rare events", default=TRUE)
  p <- add_argument(p, "--rfe", help="remove frequent events", default=TRUE)
  p <- add_argument(p, "--kofe", help="keep only first event", default=TRUE)
  p <- add_argument(p, "--milt", help="MIL as written in the text of the paper", default=TRUE)
  p <- add_argument(p, "--mili", help="MIL as shonw in the Figure of the paper", default=FALSE)
  p <- add_argument(p, "--milthres", help="MIL threshold to the sigmoid function for over-sampling", default=0.4)
  p <- add_argument(p, "--steepness", help="steepness of the sigmoid function", default=0.7)
  p <- add_argument(p, "--midpoint", help="midpoint of the sigmoid function (in days)", default=11)
  p <- add_argument(p, "--fs", help="apply feature selection", default=FALSE)
  p <- add_argument(p, "--top", help="# of features to keep in feature selection", default=200)
  p <- add_argument(p, "--rer", help="rare events ratio of the target event frequency", default=0.5)
  p <- add_argument(p, "--fer", help="frequent events ratio of the frequency of the most frequent event", default=0.8)
  p <- add_argument(p, "--milw", help="MIL window size (in days)", default=6)
  p <- add_argument(p, "--pthres", help="prediction threshold to the Risk value for a true positive episode", default=0.5)
  p <- add_argument(p, "--seed", help="seed for RF", default=500)
  p <- add_argument(p, "--csv", help="output for csv", default=TRUE)
  
  
  p <- add_argument(p, "--spme", help="export datasets for sequential pattern minning", default=FALSE)
  p <- add_argument(p, "--java", help="the java path", default="/usr/bin/java")
  p <- add_argument(p, "--python", help="the java path", default="/usr/bin/python")
  p <- add_argument(p, "--cep", help="the java path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spm_rules.py")
  p <- add_argument(p, "--spmf", help="the spmf path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spmf.jar")
  p <- add_argument(p, "--conf", help="minimum support (minsup)", default="20%")
  p <- add_argument(p, "--minti", help="minimum time interval allowed between two succesive itemsets of a sequential pattern", default=1)
  p <- add_argument(p, "--maxti", help="maximum time interval allowed between two succesive itemsets of a sequential pattern", default=5)
  p <- add_argument(p, "--minwi", help="minimum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=1)
  p <- add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11)
  p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1)
  p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=22)
  
  argv = data.frame()
  if( length(commandArgs(trailingOnly = TRUE)) != 0){
    argv <- parse_args(p)
  } else {
    argv <- parse_args(p,c(1,"training_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv","testing_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv",151,151))
  }
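  # Example invocation (file names illustrative):
  #   Rscript AIRBUS_Predictor_v3.r 1 train.csv test.csv 151 151 --fs TRUE --csv FALSE
  # With --csv TRUE (the default) a single comma-separated result line is
  # printed, which is convenient when sweeping parameters from a shell script.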
  
  
  TEST_DATA = FALSE
  id = argv$id
  REMOVE_RARE_EVENTS = argv$rre
  REMOVE_FREQUENT_EVENTS = argv$rfe
  KEEP_ONLY_FIRST_OCCURENESS = argv$kofe
  MULTI_INSTANCE_LEARNING_TEXT = argv$milt #MIL as explained in the text
  MULTI_INSTANCE_LEARNING_IMAGE = argv$mili #MIL as presented in the figure
  FEATURE_SELECTION = argv$fs
  top_features = argv$top
  target_event_frequency_proportion_rare = argv$rer
  max_event_frequency_proportion_frequent = argv$fer
  milw = argv$milw
  F_thres = argv$milthres
  s = argv$steepness
  midpoint = argv$midpoint
  target_event = argv$tet
  b_length = argv$fet
  acceptance_threshold = argv$pthres
  export_spm = argv$spme
  seed = argv$seed
  csv = argv$csv
  max_warning_interval = argv$maxwint
  min_warning_interval = argv$minwint
  
  training_set = read_dataset(argv$train)
  test_set =  read_dataset(argv$test)
  episodes_list <- preprocess(training_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres)
  
  #merge episodes
  merged_episodes = ldply(episodes_list, data.frame)
  merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))]
  
  
  if(FEATURE_SELECTION){
    #remove columns with all values equal to zero
    merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0]
    merged_episodes = feature_selection(merged_episodes,top_features)
  }
  
  TEST_DATA = TRUE
  REMOVE_RARE_EVENTS = FALSE
  REMOVE_FREQUENT_EVENTS = FALSE
  KEEP_ONLY_FIRST_OCCURENESS = FALSE
  MULTI_INSTANCE_LEARNING_TEXT = FALSE #MIL as explained in the text
  MULTI_INSTANCE_LEARNING_IMAGE = FALSE #MIL as presented in the figure
  FEATURE_SELECTION = FALSE
  test_episodes_list <- preprocess(test_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres)
  
  my.rf = eval(merged_episodes,test_episodes_list,seed)
  
  # for(s in (0:6)){
  #   my.rf = eval(merged_episodes,test_episodes_list,seed)
  #   seed = seed + 1
  # }
  
  # for(ep in 1:length(test_episodes_list)){
  #   jpeg(paste(ep,'_rplot.jpg'))
  #   plot(test_episodes_list,ep,my.rf)
  #   dev.off()
  # }
  
  if(export_spm){
    if(!csv){
      print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~")
    }
    spm_train_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$train)
    spm_test_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$test)
    spm_results_path = gsub(".csv",paste("_results_",id,".csv",sep=""),argv$train)
    confidence = argv$conf
    min_dist_seq = argv$minti
    max_dist_seq = argv$maxti
    min_dist_first_last = argv$minwi
    max_dist_first_last = argv$maxwi
    java_path = argv$java
    jspmf_path = argv$spmf
    python_path = argv$python
    cep_path = argv$cep
    max_warning_interval = argv$maxwint
    min_warning_interval = argv$minwint 
    export_ds_for_spm(target_event,episodes_list,spm_train_path)
    export_ds_for_spm(target_event,test_episodes_list,spm_test_path)
    
    if (file.exists(spm_results_path)) {
      invisible(file.remove(spm_results_path))
    }
  
    javaOutput <- system(paste(java_path,"-jar",jspmf_path,"run HirateYamana",spm_train_path,spm_results_path,confidence,min_dist_seq,max_dist_seq,min_dist_first_last,max_dist_first_last), intern = TRUE)
    #print(javaOutput)
    
    pythonOutput <- system(paste(python_path,cep_path,spm_results_path,spm_test_path,target_event), intern = TRUE)
    #print(pythonOutput)
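    # The parsing below assumes each line of the CEP script's output carries a
    # day index; lines containing the word "Failure" close an episode, and all
    # other lines are treated as warnings raised on the extracted day.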
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    total_failures = 0
    d = 0
    
    warnings = list()
    for(w in pythonOutput){
      d = as.integer(str_extract(w, "\\-*\\d+\\.*\\d*"))
      if(!grepl("Failure",w,fixed=TRUE)){
        warnings = c(warnings,d)
      } else {
        total_failures = total_failures + 1
        if(length(warnings) == 0){
          false_negatives = false_negatives + 1
        } else {
          if(length(warnings[warnings < d-max_warning_interval]) > 0){
            false_positives = false_positives + length(warnings[warnings < d-max_warning_interval])
          }
          if(length(warnings[warnings >= (d-max_warning_interval)]) > 0 & length(warnings[warnings <= (d-min_warning_interval)]) > 0){
            true_positives = true_positives + 1
          } else {
            false_negatives = false_negatives + 1
          }
        }
        warnings = list()
      }
    }
    
    precision = true_positives/(true_positives+false_positives)
    if((true_positives+false_positives) == 0){
      precision = 0
    }
  
    recall = true_positives/total_failures
    
    F1 = 2*((precision*recall)/(precision+recall))
    if((precision+recall) == 0){
      F1 = 0
    }
  
    if(!csv){
      cat(paste("dataset:",argv$test,"
  true_positives:", true_positives,"
  false_positives:", false_positives,"
  false_negatives:", false_negatives,"
  precision:", precision,"
  recall:", recall,"
  F1:", F1, "
  "))
    } else {
      cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$conf,",",argv$minti,",",argv$maxti,",",argv$minwi,",",argv$maxwi,",",argv$minwint,",",argv$maxwint, "
  ",sep=""))
    }
  }