## PART 0 - SCRIPT INFORMATION SECTION ----------------------------------------

#' Script to paste seismic data snippets into hourly file templates
#' 
#' This script can be used to generate regularly spaced hourly SAC files from 
#' a set of shorter file snippets. Data gaps are interpolated linearly. The 
#' script was written as a patch solver. It should be used with care and 
#' the scientist should be aware of the consequences of the data manipulation. 
#' 
#' Author: Michael Dietze, Section 4.6 GFZ Potsdam (mdietze@gfz-potsdam.de)
#' 
#' Script version: 0.1.0, 05 Spetember 2019
#' 
#' Changes to previous versions
#'   - not applicable
#' 
#' Requirements & dependencies: 
#'   - R 3.4.4
#'   - eseis 0.5.0

## PART 1 - DEFINITION OF OBJECTS AND IMPORT OF DATA --------------------------

## load required package
library(eseis)

## define input and output directories
dir_sac = "/sec51/data/mdietze/2018_hochvogel/data/seismic/sac_redate/"
dir_out <- "/sec51/data/mdietze/2018_hochvogel/data/seismic/sac_antenna_hourly/"

## define time range to be processed
t_range <- as.POSIXct(x = c("2018-07-09",
                            "2018-08-01"), 
                      tz = "UTC")

## read station infor file
stations <- read.table(file = paste0("/sec51/data/mdietze/2018_hochvogel/",
                                     "data/seismic/station/station_info_",
                                     "HVGL18_final_summit_utm_01.txt"), 
                       header = TRUE, 
                       stringsAsFactors = FALSE)[-7,]


## PART 2 - PROCESSING OF FILES -----------------------------------------------

## assign station IDs
ID_sac <- stations$ID

## list all files to process
files_sac <- list.files(path = dir_sac,
                        recursive = TRUE,
                        full.names = TRUE, 
                        pattern = ".SAC")

## isolate sac file name from path
files_sac_name <- do.call(c, lapply(X = files_sac, FUN = function(x) {
  
  strsplit(x = x, split = "/")[[1]][12]
}))

## convert file name to time stamp
files_sac_time <- as.POSIXct(x = substr(x = files_sac_name, 
                                        start = 7, 
                                        stop = 21), 
                             format = "%y.%j.%H.%M.%S", 
                             tz = "UTC")

## create vector of hourly time snippets of output files
t <- seq(from = t_range[1], 
         to = t_range[2], 
         by = 3600)

## loop through all hourly time segments
for(i in 16:length(t)) {
  
  ## create empty output data object
  s_empty <- lapply(X = 1:6, FUN = function(a) {
    
    return(rep(NA, times = 3600 * 200))
  })
  
  ## assign station IDs to output object
  names(s_empty) <- ID_sac
  
  ## generate time vector
  t_empty <- seq(from = t[i],
                 by  = 1/200, 
                 length.out = 3600 * 200)
  
  ## isolate input files that can fill the output object
  files_sac_i <- files_sac[files_sac_time >= t[i] - 6 * 60 &
                             files_sac_time <= (t[i] + 3600)]
  
  ## remove NA values, if present
  files_sac_i <- na.exclude(files_sac_i)
  
  ## continue if any files are present for time snippet to work on
  if(length(files_sac_i) > 0) {
    
    ## process each station individually
    for(j in 1:length(ID_sac)) {
      
      ## isolate files for station to work on
      files_sac_i_j <- files_sac_i[grepl(x = files_sac_i,
                                         pattern = ID_sac[j])]
      
      ## process each file individually
      for(k in 1:length(files_sac_i_j)) {
        
        ## read and prepare files
        s <- eseis::read_sac(file = files_sac_i_j[k])
        
        ## define file-based start and end times of the data
        s_start <- s$meta$starttime
        s_stop <- s_start + s$meta$dt * s$meta$n
        
        ## toggle indices with data TRUE
        t_paste <- t_empty >= s_start & t_empty < s_stop
        
        ## fill signal vector with existing data
        if(sum(t_paste) > 0) {
          
          s_empty[[j]][t_paste] <- s$signal[1:sum(t_paste)]
        }
        
      }
      
      ## identify data gaps
      s_na <- is.na(s_empty[[j]])
      
      ## interpolate data gaps if present
      if(sum(s_na) > 0) {
        
        ## account for first and last value being NA
        if(s_na[1] == TRUE) {
          s_na[1] <- FALSE
          s_empty[[j]][1] <- mean(s_empty[[j]], na.rm = TRUE)
        }
        
        if(s_na[length(s_na)] == TRUE) {
          s_na[length(s_na)] <- FALSE
          s_empty[[j]][length(s_na)] <- mean(s_empty[[j]], na.rm = TRUE)
        }
        
        ## find transitions to NA
        s_na_trans <- c(FALSE, diff(s_na))
        
        ## identify start and end points of data gaps
        s_na_start <- which(s_na_trans == 1) - 1
        s_na_stop <- which(s_na_trans == -1)
        
        ## if gaps exist, fill them with linear sequence from margins
        if(length(s_na_start) > 0) {
          
          for(k in 1:length(s_na_start)) {
            
            s_empty[[j]][s_na_start[k]:s_na_stop[k]] <- 
              seq(from = s_empty[[j]][s_na_start[k]], 
                  to = s_empty[[j]][s_na_stop[k]], 
                  length.out = length(s_na_start[k]:s_na_stop[k]))
          }
        }
      }
    }
    
    ## check/create output directory year
    try(if(dir.exists(paths = paste0(dir_out,
                                     format(x = t[i], 
                                            format = "%Y"))) == FALSE) {
      
      dir.create(paste0(dir_out, format(x = t[i], format = "%Y")))
    })
    
    ## check/create output directory JD
    try(if(dir.exists(paths = paste0(dir_out,
                                     format(x = t[i], 
                                            format = "%Y/%j"))) == FALSE) {
      
      dir.create(paste0(dir_out, format(x = t[i], format = "%Y/%j")))
    })
    
    ## save merged hourly data sets
    for(j in 1:length(s_empty)) {
      
      write_sac(data = s_empty[[j]], 
                file = paste0(dir_out, 
                              format(x = t[i], 
                                     format = "%Y/%j/"),
                              names(s_empty)[j], ".",
                              format(x = t[i], 
                                     format = "%y.%j.%H.%M.%S"),
                              ".BHZ.SAC"), 
                time = t[i], 
                component = "BHZ", 
                unit = 1, 
                station = names(s_empty)[j], 
                network = "HV", 
                dt = 1/200)
    }
  }
  
  ## print progress
  print(paste(i, "of", length(t)))
}