

library(tidyverse)
library(data.table)
library(ggplot2)
library(tictoc)
library(fixest)
library(arrow)

# Import data ----
DnD_master_data <- read_parquet("P:/Company Tax/Data/EUM/DnD_eum_master_data.parquet")
# read_parquet() is documented to return a tibble. Convert by reference so the
# data.table `[i, j]` syntax used below (and in the rest of the script) is
# guaranteed to work; this is a no-op if the object is already a data.table.
setDT(DnD_master_data)

# Cleaning data ----
# Keep only rows where every required variable is observed (i), and only the
# columns that are used downstream (j).
DnD_master_data <- DnD_master_data[
  !is.na(fte) &
    !is.na(d_div06) &
    !is.na(c_totlinc) &
    !is.na(c_currasst) &
    !is.na(c_currliab),
  .(id, tsid, capex, fte, d_div06, turnover, c_totlinc, c_currasst, c_currliab, company_flag)
]

# Import ID keys
bn_id_keys <- read_parquet("P:/Company Tax/Data/EUM/Keys_EUM_data.parquet")
# read_parquet() is documented to return a tibble; `.()` in j only works on a
# data.table, so convert by reference first (no-op if already a data.table).
setDT(bn_id_keys)
# Keep the identifiers needed to link businesses (bn, id) to business groups (bg_id) per year (tsid)
bn_id_keys <- bn_id_keys[, .(bn, id, bg_id, tsid)]

# Merge datasets
# Left join: every master-data row is kept; rows without a key match get NA in bn and bg_id.
merged_data <- left_join(DnD_master_data, bn_id_keys, by = c("tsid", "id"))

# Set business group (BG) equal to BN where BG is missing.
# Missing is treated as either NA or an empty string: `bg_id == ""` alone
# evaluates to NA for an NA bg_id, which fifelse() would propagate as NA.
merged_data <- merged_data[, bg_id := fifelse(is.na(bg_id) | bg_id == "", bn, bg_id)]

# Aggregate to business-group level: within each (bg_id, tsid) cell, add up the
# numeric variables across all member companies, ignoring missing values.
summed_data <- merged_data[,
  lapply(.SD, sum, na.rm = TRUE),
  by = .(bg_id, tsid),
  .SDcols = c("turnover", "fte", "capex", "c_totlinc", "c_currasst", "c_currliab")
]
# The summed total income is carried forward under a more descriptive name
setnames(summed_data, "c_totlinc", "total_comp_income")

# Industry variables for business groups (bg), which may span several industries in one year.
#   industry_simple  - the group's industry code, or "Multiple" when it has more than one
#   industry_precise - the group's industry code, or all of its distinct codes sorted and
#                      pasted together (e.g. industries A and B become "AB")
categorical_data <- merged_data[, {
  divisions <- unique(d_div06)
  single_industry <- length(divisions) == 1
  .(
    industry_simple = if (single_industry) divisions[1] else "Multiple",
    industry_precise = if (single_industry) divisions[1] else paste(sort(divisions), collapse = "")
  )
}, by = .(bg_id, tsid)]

# Industry codes cannot be summed, so they are built separately and joined back on here
data <- summed_data %>%
  full_join(categorical_data, by = c("bg_id", "tsid"))

# Bring in the overseas and company-status variables from the Business Income Tax (BIT) dataset
bit_comp_data <- fread("P:/Company Tax/Data/bit_company_data.csv")

# Derive four 0/1 indicators from the raw BIT columns and keep only the identifiers and the flags.
# NOTE(review): fread() may read blank fields in character columns as "" rather than NA;
# confirm the code columns below are numeric, otherwise the is.na() tests never fire.
bit_comp_data <- bit_comp_data[, .(
  bn,
  tsid,
  # 0 when the foreign share is blank (NA) or zero, 1 otherwise. Per the original author's
  # note, the share is left blank when foreign shareholding is below 10%, so a non-zero
  # value indicates significant foreign ownership.
  foreign_flag = fifelse(is.na(c_frgnshre) | c_frgnshre == 0, 0, 1),
  # 0 when the country code is 15 (Australia, per the original author's note), 1 for any
  # other code; an NA country code yields NA.
  Overseas = fifelse(c_uhccntcd == 15, 0, 1),
  # A blank (NA) segment code means the entity is not a small business entity
  small_bus = fifelse(is.na(c_bm_sgmt_cd), 0, 1),
  # A blank (NA) base-rate code means the entity is not a base rate entity
  base_rate_entity = fifelse(is.na(c_base_rt_bus_cd), 0, 1)
)]

# Merging business group ids with BIT variables
bit_flag_ids <- left_join(bit_comp_data, bn_id_keys, by = c("bn", "tsid"))

# Fall back to the business number where the business group id is missing.
# Missing is treated as either NA (e.g. no key match in the left join) or an
# empty string: `bg_id == ""` alone evaluates to NA for an NA bg_id.
bit_flag_ids <- bit_flag_ids[, bg_id := fifelse(is.na(bg_id) | bg_id == "", bn, bg_id)]

# Keep one row per business group and year.
# NOTE(review): unique(by = ...) keeps the first row of each group, so the flags
# (and the bn / id columns carried along) come from whichever member business
# happens to appear first - confirm this is the intended aggregation.
bit_flag_ids <- unique(bit_flag_ids, by = c("bg_id", "tsid"))

# Inner join: business groups without a BIT record in a given year are dropped
data <- merge(data, bit_flag_ids, by = c("bg_id", "tsid"))

# Generate variable for business age as a control
birthdate_data <- fread("R:/blade-standard/Birthdate (ABN)/CSV/birthdate_bn.csv")
# Attach the business group id and year (tsid) to each business's birth date
birthdate_data <- inner_join(birthdate_data, bn_id_keys, by = "bn")

# Fall back to the business number where the business group id is missing
# (NA or empty string); `bg_id == ""` alone evaluates to NA for an NA bg_id.
birthdate_data <- birthdate_data[, bg_id := fifelse(is.na(bg_id) | bg_id == "", bn, bg_id)]

# Age = (2000 + tsid) minus the year taken from the first four characters of birth_date, plus one
birthdate_data <- birthdate_data[, age := 2000 + tsid - as.numeric(substr(birth_date, 1, 4)) + 1]

# If a business group owns multiple businesses, its age is that of its oldest business.
# When no member has a usable birth date the age is NA rather than the -Inf
# (plus warning) that max(..., na.rm = TRUE) returns on an all-NA input.
birthdate_data <- birthdate_data[, .(
  age = if (all(is.na(age))) NA_real_ else max(age, na.rm = TRUE)
), by = .(tsid, bg_id)]

# Inner join: business groups without any birth-date record in a given year are dropped
data <- merge(data, birthdate_data, by = c("bg_id", "tsid"))

# Log capital expenditure and year factor
# Rows with capex exactly equal to 1 are removed: log(1) is 0, which would be
# indistinguishable from the value assigned to zero capex below, and 1 is an
# implausible capex value in any case.
data <- data[capex != 1]

data[, `:=`(
  # Log investment, with zero capex mapped to 0 instead of log(0) = -Inf
  capex_log = ifelse(capex == 0, 0, log(capex)),
  # Year as a factor, for fixed-effect interactions
  tsid_factor = as.factor(tsid)
)]

# Controls
## Liquidity
# Remove business groups with neither current assets nor current liabilities
# (c_currasst = c_currliab = 0): their liquidity ratio would be 0 / 0.
data <- data[!(c_currasst == 0 & c_currliab == 0)]

# Liquidity ratio = current assets / current liabilities.
# NOTE(review): groups with assets but zero liabilities are retained and get
# liquidity = Inf (and log_lqdty = Inf); drop them explicitly if infinite
# ratios are unwanted downstream.
data[, liquidity := c_currasst / c_currliab]

# Smallest strictly positive, finite liquidity in each financial year
data[, min_lqdty := min(liquidity[liquidity > 0 & is.finite(liquidity)], na.rm = TRUE), by = tsid]

# Log liquidity; where liquidity is zero, use log(m / (1 + m)) with m the yearly
# minimum, which is smaller than the smallest observed log liquidity, log(m)
data[, log_lqdty := fifelse(liquidity == 0, log(min_lqdty / (1 + min_lqdty)), log(liquidity))]

## Income
# Keep only business groups with non-negative total income (the original author notes only 9 are negative)
data <- data[total_comp_income >= 0]

# Smallest strictly positive total income within each financial year
data[, min_tot_inc := min(total_comp_income[total_comp_income > 0], na.rm = TRUE), by = tsid]

# Log income; zero income is assigned log(m / (1 + m)), m being the yearly minimum,
# which lies slightly below the smallest observed log income
data[, log_tot_inc := fifelse(
  total_comp_income != 0,
  log(total_comp_income),
  log(min_tot_inc / (1 + min_tot_inc))
)]

## FTE
# Smallest strictly positive FTE within each financial year
data[, min_fte := min(fte[fte > 0], na.rm = TRUE), by = tsid]

# Log FTE; zero FTE is assigned log(m / (1 + m)), m being the yearly minimum,
# which lies slightly below the smallest observed log FTE
data[, fte_log := ifelse(
  fte != 0,
  log(fte),
  log(min_fte / (1 + min_fte))
)]

# Flags
data[, `:=`(
  # 1 if the business group has positive capex in the year (i.e. it invests), 0 otherwise
  invest_dummy = as.numeric(capex > 0),
  # 1 if current assets are below current liabilities (liquidity ratio under 1), 0 otherwise
  lqdty_flag = as.numeric(liquidity < 1)
)]

# Exclude mining: drop every business group whose industry code string contains "B"
data <- data[!grepl("B", industry_precise, fixed = TRUE)]

# Dynamic Period Variable
# period equals tsid (as a number) for years 12 through 19 and is NA for any other year
data[, period := fifelse(tsid %in% 12:19, as.numeric(tsid), NA_real_)]

# Save the finished regression dataset
data %>%
  write_csv("P:/Company Tax/Data/EUM/bas_eum_reg_data.csv")
