
library(tidyverse)
library(data.table)
library(arrow)

# Financial-year labels used to build the input file names in the import loops
# below. Each label is the two-digit start year followed by the two-digit end
# year, running from "0102" through "2324".
fy_start <- 1:23
years <- sprintf("%02d%02d", fy_start, fy_start + 1L)

# Collecting BAS data ----
# For each financial year: read the BAS extract, then keep only the businesses
# that were flagged as companies (x_tolo in 1-5) in all four quarterly
# indicative files for that year. The yearly results are stacked and written
# to parquet.

bas_csv_prefix <- "R:/blade-standard/30 June 2023 (Longitudinal EUM)/BAS/CSV/blade2223_bas_id_"
indicative_csv_prefix <- "R:/blade-standard/Indicative Data Items (EUM)/CSV/indicative_id_q"

# x_tolo values that the quarterly filter treats as "company"
company_tolo_codes <- c(1, 2, 3, 4, 5)

# BAS columns that are left in their original type
bas_untouched_cols <- c("tsid", "bas_version", "id")

# List to be filled, one element per financial year
bas_eum_company_data <- vector("list", length(years))
names(bas_eum_company_data) <- years

for (year in years) {
  print(paste0("Processing: ", year)) # See processing progress
  bas_data <- fread(paste0(bas_csv_prefix, year, ".csv"))

  # Converting most variables to double so that they bind (merge together)
  cols_to_double <- setdiff(names(bas_data), bas_untouched_cols)
  bas_data[, (cols_to_double) := lapply(.SD, as.double), .SDcols = cols_to_double]

  # Finding which businesses were companies in each quarter.
  # Only `id` and `x_tolo` are needed, so only those columns are read.
  # unique() guards against an id appearing more than once in a quarterly
  # file, which would otherwise duplicate BAS rows in the merges below.
  quarterly_company_ids <- lapply(1:4, function(quarter) {
    indicative <- fread(
      paste0(indicative_csv_prefix, quarter, "_", year, ".csv"),
      select = c("id", "x_tolo")
    )
    unique(indicative[x_tolo %in% company_tolo_codes, .(id)])
  })

  # Only including businesses which were companies for the whole year
  # (inner join of the four quarterly id lists)
  company_year_ids <- Reduce(
    function(lhs, rhs) merge(lhs, rhs, by = "id"),
    quarterly_company_ids
  )

  bas_eum_company_data[[year]] <- merge(bas_data, company_year_ids, by = "id")
}

# Years can differ in the columns they carry, so match by name and fill gaps
bas_eum_company_data <- rbindlist(bas_eum_company_data, use.names = TRUE, fill = TRUE)

write_parquet(bas_eum_company_data, "P:/Company Tax/Data/bas_eum_company_data.parquet")

# PAYG extracts ----
# Read the PAYG file for every financial year, stack the yearly tables
# (columns matched by name) and save the result as parquet.
payg_paths <- paste0(
  "R:/blade-standard/30 June 2023 (Longitudinal EUM)/PAYG/CSV/blade2223_payg_id_",
  years,
  ".csv"
)
names(payg_paths) <- years

payg_data <- lapply(years, function(fy) {
  print(paste0("Processing: ", fy)) # See processing progress
  fread(payg_paths[[fy]])
})
names(payg_data) <- years

payg_data <- bind_rows(payg_data)

write_parquet(payg_data, "P:/Company Tax/Data/payg_eum_data.parquet")

# ------------------------------------------------------------------------------
# Collecting Business Income Taxation (BIT) Data
# Not all variables are available in every year, so the set of variables kept
# for a given financial year is built from a common core plus additions and
# removals keyed on the two-digit ending year of the label (e.g. 12 for "1112").

# Identifier columns, kept as read
bit_id_vars <- c("tsid", "id")

# Variables kept (and converted to double) for financial years ending before 12.
# The first year processed fixes the column order of the combined output, so
# the order of this vector matters.
bit_core_vars <- c(
  "tax_payable", "c_totlasst", "c_taxicalc", "c_toprolos", "c_divfrank",
  "c_divufran", "c_totlinc", "c_uga", "c_taxinc", "c_frrswinc", "c_gpnabn",
  "c_sales", "c_grosintr", "c_deprexps", "c_franbalc", "c_intdepas",
  "c_othdepas", "c_royexpos", "c_royexpau", "c_intexpos", "c_intexpau",
  "c_currasst", "c_currliab", "c_totlliab"
)

# Returns the value variables to keep for a financial year ending in `end_year`
# (two-digit number, e.g. 13 for "1213").
bit_vars_for_year <- function(end_year) {
  vars <- bit_core_vars
  if (end_year >= 12) {
    vars <- c(vars, "c_taxti")
  }
  if (end_year >= 13) { # Total debt is only available from FY 1213
    vars <- c(vars, "c_totl_debt_amt")
  }
  if (end_year >= 13 && end_year <= 15) { # TOFA expenses available FY 1213-1415
    vars <- c(vars, "c_expnss_fincl_arngmts_tofa_amt")
  }
  if (end_year >= 18) {
    vars <- setdiff(vars, c("c_frrswinc", "c_gpnabn"))
  }
  if (end_year >= 22) {
    vars <- setdiff(vars, "c_franbalc")
  }
  vars
}

bit_comp_data <- vector("list", length(years))
names(bit_comp_data) <- years

for (year in years) {
  print(paste0("Processing: ", year)) # See processing progress
  # NOTE(review): this prefix is "blade_2223_" whereas the other extracts in
  # this script use "blade2223_" - confirm the file name is as intended.
  bit_data <- fread(paste0("R:/blade-standard/30 June 2023 (Longitudinal EUM)/BIT/CSV/blade_2223_bit_id_", year, ".csv"))

  # Harmonise the tax payable variable under a single name
  if ("c_subt5" %in% names(bit_data)) {
    setnames(bit_data, old = "c_subt5", new = "tax_payable")
  } else if ("c_taxassd" %in% names(bit_data)) { # variable name switches in 2011-12
    setnames(bit_data, old = "c_taxassd", new = "tax_payable")
  }

  end_year <- as.numeric(substr(year, 3, 4))
  value_vars <- bit_vars_for_year(end_year)

  # Fail with a clear message if the file lacks a column this year should have
  missing_vars <- setdiff(c(bit_id_vars, value_vars), names(bit_data))
  if (length(missing_vars) > 0) {
    stop(
      "BIT file for ", year, " is missing expected columns: ",
      paste(missing_vars, collapse = ", "),
      call. = FALSE
    )
  }

  # Convert to double so that the yearly tables bind without type clashes
  bit_data[, (value_vars) := lapply(.SD, as.double), .SDcols = value_vars]

  bit_comp_data[[year]] <- bit_data[, c(bit_id_vars, value_vars), with = FALSE]
}

# Columns absent in a given year are filled with NA
bit_comp_data <- rbindlist(bit_comp_data, use.names = TRUE, fill = TRUE)

write_parquet(bit_comp_data, "P:/Company Tax/Data/bit_comp_eum_data.parquet")

# Industry data ----
# For each financial year, read the indicative data items file and keep only
# the id, tsid and d_div06 columns; then stack all years and save as parquet.
CSI_data <- lapply(years, function(fy) {
  print(fy)
  indicative <- fread(paste0("R:/blade-standard/30 June 2023 (Longitudinal EUM)/Indicative Data Items/CSV/blade2223_indicative_id_", fy, ".csv"))
  indicative[, c("id", "tsid", "d_div06"), with = FALSE]
})
names(CSI_data) <- years

CSI_data <- rbindlist(CSI_data, use.names = TRUE)

write_parquet(CSI_data, "P:/Company Tax/Data/EUM/CSI_Industry_EUM_data.parquet")

# ID to BN keys ----
# Read the id-to-BN key file for every financial year, stack the yearly tables
# by column name and save the result as parquet.
keys_yearly_data <- setNames(
  lapply(years, function(fy) {
    print(fy)
    fread(paste0("R:/blade-standard/30 June 2023 (Longitudinal EUM)/ID to BN Key/CSV/blade2223_id_bn_key_", fy, ".csv"))
  }),
  years
)

keys_yearly_data <- rbindlist(keys_yearly_data, use.names = TRUE)

write_parquet(keys_yearly_data, "P:/Company Tax/Data/EUM/Keys_EUM_data.parquet")





