# Load packages ----
# Install any required package that is not yet available.
# `install.packages()` expects the package names as ONE character vector;
# names passed as separate positional arguments are matched to `lib`, `repos`,
# `contriburl`, ... instead of being installed.
required_packages <- c(
  "tidyverse", "data.table", "readr", "ggplot2", "scales", "arrow"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
library(data.table)
library(readr)
library(ggplot2)
library(scales)
library(arrow)

# Large penalty on scientific notation so numbers print in fixed notation.
options(scipen = 999)

# Import ----
# BAS yearly company data: read the CSV and keep bn, tsid and turnover.
bas_company_yearly_data <- fread(
  "P:/Company Tax/Data/bas_company_yearly_data.csv"
)[, c("bn", "tsid", "turnover"), with = FALSE]

# BIT company data: read the CSV and keep bn, tsid, the c_base_rt_bus_cd
# column and total income (c_totlinc).
bit_comp_data <- fread(
  "P:/Company Tax/Data/bit_company_data.csv"
)[, c("bn", "tsid", "c_base_rt_bus_cd", "c_totlinc"), with = FALSE]

# Import business group keys
keys <- read_parquet("P:/Company Tax/Data/EUM/Keys_EUM_data.parquet")
# read_parquet() returns a tibble by default, so convert it (by reference) to a
# data.table before using data.table's `[, .(…)]` column selection below.
setDT(keys)
keys <- keys[, .(bn, bg_id, tsid)] # select variables

# Merge
# Inner-join BAS, BIT and the business group keys one after another, always
# matching on bn and tsid; rows without a match in every table are dropped.
master_data <- Reduce(
  function(left, right) merge(left, right, by = c("bn", "tsid")),
  list(bas_company_yearly_data, bit_comp_data, keys)
)

# Set business group (BG) equal to BN where BG is missing (NA or empty string).
# Testing only `bg_id == ""` leaves NA ids untouched, and every NA row would
# then be summed into one single NA group per tsid in the aggregation below.
# `:=` updates master_data by reference.
# NOTE(review): fifelse() needs bn and bg_id to be of the same type — confirm.
master_data[, bg_id := fifelse(is.na(bg_id) | bg_id == "", bn, bg_id)]

# Total income per business group and tsid (missing incomes are ignored).
Bunching <- master_data[, .(
  total_income = sum(c_totlinc, na.rm = TRUE)
  ), by = .(bg_id, tsid)]

# Analysis ----
# Standard bins for analysis and charting: 100,000-wide bins covering
# 0 to 30,000,000. Incomes outside that range get no bin (NA).
breaks <- seq(from = 0, to = ceiling(30000000), by = 100000)

# Tag every row of Bunching with the lower edge of its income bin, keeping the
# data grouped by tsid. Bins are closed on the left; include.lowest = TRUE
# additionally lets a value equal to the top break fall into the last bin.
Bins_2_25m1 <- mutate(
  group_by(Bunching, tsid),
  total_income_bin = cut(
    total_income,
    breaks = breaks,
    labels = FALSE,
    right = FALSE,
    include.lowest = TRUE
  ),
  total_income_bin = 100000 * (total_income_bin - 1) # bin index -> lower edge
)

# Number of rows per tsid and bin; counts below 10 are replaced by NA and
# rows that fell outside the bins are removed.
bin_counts_100k <- count(Bins_2_25m1, total_income_bin, name = "n")
bin_counts_100k <- mutate(bin_counts_100k, n = ifelse(n < 10, NA, n))
Bins_2_25m_all <- filter(bin_counts_100k, !is.na(total_income_bin))

write.csv(
  Bins_2_25m_all,
  file = "O:/Company Tax/Metrics and Bunching Code Clearance - QA/Bunching Analysis/Bunching 2-25_all BIT.csv"
)

# Coarser bins: 1,000,000-wide bins covering 0 to 60,000,000.
# Incomes outside that range get no bin (NA).
breaks <- seq(from = 0, to = ceiling(60000000), by = 1000000)

# Tag every row of Bunching with the lower edge of its income bin, keeping the
# data grouped by tsid. Bins are closed on the left; include.lowest = TRUE
# additionally lets a value equal to the top break fall into the last bin.
Bins_0_50m <- mutate(
  group_by(Bunching, tsid),
  total_income_bin = cut(
    total_income,
    breaks = breaks,
    labels = FALSE,
    right = FALSE,
    include.lowest = TRUE
  ),
  total_income_bin = 1000000 * (total_income_bin - 1) # bin index -> lower edge
)

# Number of rows per tsid and bin; counts below 10 are replaced by NA and
# rows that fell outside the bins are removed.
bin_counts_1m <- count(Bins_0_50m, total_income_bin, name = "n")
bin_counts_1m <- mutate(bin_counts_1m, n = ifelse(n < 10, NA, n))
Bins_0_50m_all <- filter(bin_counts_1m, !is.na(total_income_bin))

write.csv(
  Bins_0_50m_all,
  file = "O:/Company Tax/Metrics and Bunching Code Clearance - QA/Bunching Analysis/Bunching 0-50m BIT base.csv"
)

  
  