rm(list=ls())

library(dplyr)

# Set seed for replication purposes
set.seed(2024)

result <- as.data.frame(matrix(NA, nrow=1000, ncol=3))
names(result) <- c("MaxDupl", "Correct", "FalsePos")

for (i in 1:1000) {

  # ------ Simulation of dataset ------
  # Source: whuber (2018), Generating correlated binomial random variables.
  # Accessed last on 02.04.2023 from https://stats.stackexchange.com/q/285008

  # Compute Pr(0,0) from rho, p=Pr(X=1), and q=Pr(Y=1).
  a <- function(rho, p, q) {
    rho * sqrt(p*q*(1-p)*(1-q)) + (1-p)*(1-q)
  }

  # Specify number of observations and variables in dataset (prior to adding duplicates)
  n.sim <- 10500   # Number of observations
  n.var <- 11      # Number of variables
  data <- matrix(NA, nrow=n.sim, ncol=n.var)

  # Specify parameters for the FIRST two variables (replicates "state of birth" and "state of employment")
  n.options <- 50  # Number of variable categories or 'answer options'
  p <- 1/3
  q <- 1/4
  rho <- 0.4985

  # Compute the four probabilities for the joint distribution.
  a.0 <- a(rho, p, q)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) {
    print(prob)
    stop("Error: a probability is negative.")
  }

  # Generation of correlated Binomial variables.
  round <- 1  # Reflects the first, second, ... set of two variables
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  data[,round*2-1] <- colSums(matrix(x, nrow=n.options))  # Sum in groups of `n.options`
  data[,round*2]   <- colSums(matrix(y, nrow=n.options))  # Sum in groups of `n.options`

  # Specify parameters for the SECOND two variables (replicates "gender" and "race")
  n.options <- 2   # Number of variable categories or 'answer options'
  p <- 1/3
  q <- 1/4
  rho <- 0.0536

  # Compute the four probabilities for the joint distribution.
  a.0 <- a(rho, p, q)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) {
    print(prob)
    stop("Error: a probability is negative.")
  }

  # Generation of correlated Binomial variables.
  round <- 2  # Reflects the first, second, ... set of two variables
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  data[,round*2-1] <- colSums(matrix(x, nrow=n.options))  # Sum in groups of `n.options`
  data[,round*2]   <- colSums(matrix(y, nrow=n.options))  # Sum in groups of `n.options`

  # Specify parameters for the THIRD two variables (replicates "Employment Father" and "Employment Mother")
  n.options <- 4   # Number of variable categories or 'answer options'
  p <- 1/3
  q <- 1/4
  rho <- 0.2765

  # Compute the four probabilities for the joint distribution.
  a.0 <- a(rho, p, q)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) {
    print(prob)
    stop("Error: a probability is negative.")
  }

  # Generation of correlated Binomial variables.
  round <- 3  # Reflects the first, second, ... set of two variables
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  data[,round*2-1] <- colSums(matrix(x, nrow=n.options))  # Sum in groups of `n.options`
  data[,round*2]   <- colSums(matrix(y, nrow=n.options))  # Sum in groups of `n.options`

  # Specify parameters for the FOURTH two variables (replicates "Education Father" and "Education Mother")
  n.options <- 5   # Number of variable categories or 'answer options'
  p <- 1/3
  q <- 1/4
  rho <- 0.5493

  # Compute the four probabilities for the joint distribution.
  a.0 <- a(rho, p, q)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) {
    print(prob)
    stop("Error: a probability is negative.")
  }

  # Generation of correlated Binomial variables.
  round <- 4  # Reflects the first, second, ... set of two variables
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  data[,round*2-1] <- colSums(matrix(x, nrow=n.options))  # Sum in groups of `n.options`
  data[,round*2]   <- colSums(matrix(y, nrow=n.options))  # Sum in groups of `n.options`

  # Specify parameters for the FIFTH two variables (replicates "Age" and "Length in agency")
  n.options <- 45  # Number of variable categories or 'answer options'
  p <- 1/3
  q <- 1/4
  rho <- 0.3589

  # Compute the four probabilities for the joint distribution.
  a.0 <- a(rho, p, q)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) {
    print(prob)
    stop("Error: a probability is negative.")
  }

  # Generation of correlated Binomial variables.
  round <- 5  # Reflects the first, second, ... set of two variables
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  data[,round*2-1] <- colSums(matrix(x, nrow=n.options))  # Sum in groups of `n.options`
  data[,round*2]   <- colSums(matrix(y, nrow=n.options))  # Sum in groups of `n.options`

  # Add a final random variable (replicates "education")
  n.options <- 5   # Number of variable categories or 'answer options'
  data[, n.var] <- sample(1:n.options, n.sim, replace = TRUE)

  data <- as.data.frame(data)
  cor(data)  # Inspect correlations of the simulated variables (not printed inside the loop)

  # ------ Duplicate 950 observations and split sample over two "time periods" ------
  Matched <- sample(1:n.sim, 950, replace=FALSE)  # Randomly select 950 observations as matches
  data$time <- 0
  data$match <- 0

  data.temp1 <- data[-Matched,]
  temp <- sample(1:nrow(data.temp1), nrow(data.temp1)/2, replace=FALSE)
  data.temp1$time[temp] <- 1

  data.temp2 <- data[Matched,]
  data.temp2$time <- 1
  data.temp2$match <- 1

  data.temp3 <- data[Matched,]
  data.temp3$match <- 1

  data <- rbind(data.temp1, data.temp2, data.temp3)
  #summary(data)

  # ------ Apply matching methodology ------

  # Step 1: Set up function that amalgamates and filters all at once
  panelmatch <- function(data, vars, at_least) {
    store <- paste0(paste(vars, collapse=", "))  # Note: `store` is not used below
    data %>%
      group_by(!!!syms(vars)) %>%
      dplyr::mutate(FULLSET = cur_group_id()) %>%
      group_by(FULLSET) %>%
      arrange(FULLSET, time) %>%
      filter(n() > (at_least - 1)) %>%  # What frequency do we want to select
      mutate(count = n()) %>%           # Record how many times it occurs
      ungroup()
  }
  ## Instructions:
  # data     = name of the dataset
  # vars     = a character vector of the variables you want to match on
  # at_least = the minimum number of matches you want to filter on;
  #            e.g.
  #            at_least = 2 means filter for cases where a duplicate occurs at least twice.

  characteristics <- c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11")
  Step4 <- panelmatch(data=data, vars=characteristics, at_least=2)
  Step4 <- Step4[order(-Step4$count, Step4$FULLSET, Step4$time), ]

  # ------ Calculate key characteristics of matched dataset ------
  #max(Step4$count)                                          # Maximum duplicates
  #nrow(Step4[(Step4$match == 1 & Step4$count == 2), ]) / 2  # Correct matches
  #nrow(Step4[(Step4$match == 0 & Step4$count == 2), ]) / 2  # False positives

  result[i,1] <- max(Step4$count)
  result[i,2] <- nrow(Step4[(Step4$match == 1 & Step4$count == 2), ]) / 2
  result[i,3] <- nrow(Step4[(Step4$match == 0 & Step4$count == 2), ]) / 2
}

par(mfrow = c(1, 2))
hist(result[,2], main = "Correct Matches", xlab = "Number of correct matches",
     freq = TRUE, ylim = c(0, 1000),
     breaks = c(940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950))
hist(result[,3], main = "False Positives", xlab = "Number of false positives",
     freq = TRUE, ylim = c(0, 1000),
     breaks = c(0, 0.99, 1.99, 2.99, 3.99, 4.99, 5.99, 6.99, 7.99, 8.99, 9.99))
par(mfrow = c(1, 1))

table(result[,2])
table(result[,3])
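
# ------ Optional: helper for the repeated correlated-Binomial blocks ------
# A minimal sketch, not called by the script above; the name `sim.cor.binom` and its
# argument list are illustrative. It reproduces the whuber (2018) construction used
# five times in the loop: draw `n.sim * n.options` pairs of correlated Bernoulli
# variables from the joint distribution implied by (p, q, rho) and sum them in
# groups of `n.options` to obtain two correlated Binomial variables.
sim.cor.binom <- function(n.sim, n.options, p, q, rho) {
  a.0  <- rho * sqrt(p*q*(1-p)*(1-q)) + (1-p)*(1-q)  # Pr(0,0)
  prob <- c(`(0,0)`=a.0, `(1,0)`=1-q-a.0, `(0,1)`=1-p-a.0, `(1,1)`=a.0+p+q-1)
  if (min(prob) < 0) stop("Error: a probability is negative.")
  u <- sample.int(4, n.sim * n.options, replace=TRUE, prob=prob)
  y <- floor((u-1)/2)
  x <- 1 - u %% 2
  cbind(colSums(matrix(x, nrow=n.options)),   # first variable of the pair
        colSums(matrix(y, nrow=n.options)))   # second variable of the pair
}
# Illustrative use, equivalent to the FIRST pair of variables above:
# data[, 1:2] <- sim.cor.binom(n.sim=10500, n.options=50, p=1/3, q=1/4, rho=0.4985)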
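
# ------ Optional: toy illustration of panelmatch() ------
# A minimal sketch on a hypothetical three-row data frame; `toy` is illustrative and
# not part of the simulation above. Rows that agree on all matching variables are
# grouped, and only groups occurring at least `at_least` times are kept.
toy <- data.frame(V1   = c(1, 1, 2),
                  V2   = c(5, 5, 7),
                  time = c(0, 1, 0))
panelmatch(data=toy, vars=c("V1", "V2"), at_least=2)
# Expected: the two rows with (V1, V2) = (1, 5) are returned with count = 2;
# the unmatched third row is filtered out.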