#R Code for Asymmetric Fixed Effects Models for Panel Data
#Paul D. Allison
#31 December 2018

#Libraries
library(haven)
library(magrittr)
library(clubSandwich)
library(car)
library(plm)
library(tidyverse)
library(nlme)
library(survival)
#Section 2
nlsy<-read_dta(url("https://statisticalhorizons.com/wp-content/uploads/nlsy.dta"))
nlsy$antidiff<-nlsy$anti92-nlsy$anti90
nlsy$selfdiff<-nlsy$self92-nlsy$self90
nlsy$povdiff<-nlsy$pov92-nlsy$pov90
#Table 1
table(nlsy$povdiff)
#Figure 1
hist(nlsy$selfdiff)
#Figure 2
hist(nlsy$antidiff)
nlsy$selfpos<- nlsy$selfdiff*(nlsy$selfdiff>0)
nlsy$selfneg<- -nlsy$selfdiff*(nlsy$selfdiff<0)
nlsy$povpos<- nlsy$povdiff*(nlsy$povdiff>0)
nlsy$povneg<- -nlsy$povdiff*(nlsy$povdiff<0)
#Table 2
ols<-lm(antidiff ~ selfdiff + povdiff, data = nlsy)
summary(ols)

#Section 3
#Table 3
ols3<-lm(antidiff ~ selfpos + selfneg +povpos + povneg, data = nlsy)
summary(ols3)
lht(ols3,"selfpos=-selfneg")

#Section 4
#Read wage data from the web in wide form:
wage<-read_dta(url("https://statisticalhorizons.com/wp-content/uploads/wagerate.dta"))

#Reshape to long form:
wage$id <- seq(1:nrow(wage)) # add id value
data.table::setDT(wage)
wage_long <- data.table::melt(wage, 
                   measure = patterns("^lwage","^mar","^edu","urb"), 
                   variable.name = "time",
                   value.name = c("wage","mar", "edu", "urb"))
wage_long %<>% arrange(id, time)

#Table 4
fe.plm <- plm(wage ~ mar + edu + urb + time,
              index = "id", model = "within", data = wage_long)
summary(fe.plm)
coef_test(fe.plm, vcov = "CR1S", cluster = wage_long$id, test = "naive-t")


#Create difference scores and decompose them
wage_long2 <- wage_long %>%
  group_by(id) %>%
  mutate(wagediff = wage-lag(wage), mardiff = mar-lag(mar), 
         edudiff = edu-lag(edu), urbdiff=urb-lag(urb)) %>% # Create first-difference scores for all variables
  mutate(marpos = mardiff*(mardiff>0), marneg = -mardiff*(mardiff<0), 
         urbpos = urbdiff*(urbdiff>0), urbneg = -urbdiff*(urbdiff<0))# Decompose difference scores into positive and negative components
wage_long2<-as.data.frame(wage_long2) 

#Table 5, left panel
wageols<-lm(wagediff ~ mardiff + edudiff +urbdiff + time, data = wage_long2)
summary(wageols)
coef_test(wageols, 
          vcov = "CR1S", cluster = wage_long2$id, test = "naive-t")

#Table 5, right panel
wage.fe <- plm(wagediff ~ mardiff + edudiff + urbdiff + time,
               index = "id", model = "within", data = wage_long2)
summary(wage.fe)
coef_test(wage.fe, vcov = "CR1S", cluster = wage_long2$id, test = "naive-t")

#Table 6, left panel
#Delete cases from time 1 with missing difference scores (gls won't tolerate NAs):
wage_long3<-na.omit(wage_long2)
#Then subtract 1 from time factor so it has values 1,2,3,4.  
wage_long3$time<-as.factor(as.numeric(as.character(wage_long3$time))-1)
gls.uns <- gls(wagediff ~ mardiff + edudiff + urbdiff + time,
               corr = corSymm(form = ~ 1 | id),       # unstructured covariance
               weights = varIdent(form = ~ 1 | time), # unstructured variance 
               data = wage_long3,
               method = "ML")
summary(gls.uns)
coef_test(gls.uns, vcov = "CR1S", cluster = wage_long3$id, test = "naive-t")

#Table 6, right panel
gls.toe2 <- gls(wagediff ~ mardiff + edudiff + urbdiff + time,
                corr = corARMA(form = ~ 1 | id, p = 0, q = 1),  
                data = wage_long3,
                method = "ML")
summary(gls.toe2)
coef_test(gls.toe2, vcov = "CR1S", cluster = wage_long3$id, test = "naive-t")

#Fully constrained GLS
gls.con <- gls(wagediff ~ mardiff + edudiff + urbdiff + time,
               corr = corARMA(value=-.9999,form = ~ 1 | id, p = 0, q = 1, fixed=T),  
               data = wage_long3, method = "ML")
summary(gls.con)
coef_test(gls.con, vcov = "CR1S", cluster = wage_long3$id, test = "naive-t")

#Section 5
#Table 7
table(wage_long3$urbdiff)
table(wage_long3$mardiff)

#Table 8
gls.con2 <- gls(wagediff ~ marpos + marneg + edudiff + urbpos + urbneg + time,
                corr = corARMA(value=-.9999,form = ~ 1 | id, p = 0, q = 1, fixed=T),  
                data = wage_long3, method = "ML")
summary(gls.con2)
coef_test(gls.con2, vcov = "CR1S", cluster = wage_long3$id, test = "naive-t")
lht(gls.con2,"urbpos=-urbneg")
lht(gls.con2,"marpos=-marneg")


#Section 6

#Read 5-period data from the web in wide form:
wage<-read_dta(url("https://statisticalhorizons.com/wp-content/uploads/wagerate.dta"))
wage$id <- seq(1:nrow(wage)) # add id variable

#Convert to long form:
wage.long<- wage %>%
  gather(Key, Value, -id, -male, -i) %>%
  separate(Key, into=c("var", "time"), sep = "(?<=[A-Za-z])(?=[0-9])") %>%
  spread(var, Value)

#Create difference scores, decompose them, and cumulate them:
wage.long2 <- wage.long %>%
  group_by(id) %>%
  mutate(wagediff = lwage-lag(lwage), mardiff = mar-lag(mar), 
         edudiff = edu-lag(edu), urbdiff = urb-lag(urb)) %>% # Create first-difference scores for all variables
  mutate(marpos = mardiff*(mardiff>0), marneg = -mardiff*(mardiff<0),
         urbpos = urbdiff*(urbdiff>0), urbneg = -urbdiff*(urbdiff<0)) %>% # Decompose difference scores into positive and negative components
  replace(is.na(.), 0) %>% # ATTN: this makes all missing values zero, not just the NAs from t=1.
  mutate(marcumpos = cumsum(marpos), marcumneg = cumsum(marneg), 
         urbcumpos = cumsum(urbpos), urbcumneg = cumsum(urbneg)) #Generate cumulative values of the difference scores:
wage.long2<-as.data.frame(wage.long2)

#Estimate fixed effects model:
wage.fe <- plm(lwage ~ marcumpos + marcumneg + edu + urbcumpos + urbcumneg + time, index = "id", model = "within", data = wage.long2)
summary(wage.fe)
coef_test(wage.fe, vcov = "CR1S", cluster = wage.long2$id,  test = "z")
lht(wage.fe,"urbcumpos=-urbcumneg")
lht(wage.fe,"marcumpos=-marcumneg")


#Section 7
#Read 5-period data from the web in wide form:
teenpov<-read_dta(url("https://statisticalhorizons.com/wp-content/uploads/teenpov.dta"))
#Convert data to long form:
teenpov.long<- teenpov %>%
  gather(Key, Value, -id, -black,-age) %>%
  separate(Key, into=c("var", "time"), sep = "(?<=[A-Za-z])(?=[0-9])") %>%
  spread(var, Value)

#Table 9, Estimate symmetric fixed effects
teenpov.fe<- clogit(pov~mother+spouse+inschool+hours+time+strata(id),
                    data=teenpov.long)
summary(teenpov.fe)

#Create variables for asymmetric model
teenpov.long2 <- teenpov.long %>%
  group_by(id) %>%
  mutate(spousediff = spouse-lag(spouse), schooldiff = inschool-lag(inschool), 
         hoursdiff = hours-lag(hours)) %>% # Create first-difference scores for all variables
  mutate(spousepos = spousediff*(spousediff>0), spouseneg = -spousediff*(spousediff<0), 
         schoolpos = schooldiff*(schooldiff>0), schoolneg = -schooldiff*(schooldiff<0),
         hourspos=hoursdiff*(hoursdiff>0),hoursneg=-hoursdiff*(hoursdiff<0)) %>% # Decompose difference scores into positive and negative components
  replace(is.na(.), 0) %>% # ATTN: this makes all missing values zero, not just the NAs from t=1.
  mutate(spousecumpos = cumsum(spousepos), spousecumneg = cumsum(spouseneg), 
         schoolcumpos = cumsum(schoolpos), schoolcumneg = cumsum(schoolneg),
         hourscumpos = cumsum(hourspos),   hourscumneg = cumsum(hoursneg)) #Generate cumulative values of the difference scores:

#Table 10 - Estimate asymmetric fixed effects model
teenpov.fe2<- clogit(pov~mother+spousecumpos+spousecumneg+schoolcumpos+schoolcumneg+
                       hourscumpos+hourscumneg+time+strata(id), data=teenpov.long2)
summary(teenpov.fe2)
lht(teenpov.fe2,"spousecumpos=-spousecumneg")
lht(teenpov.fe2,"schoolcumpos=-schoolcumneg")
lht(teenpov.fe2,"hourscumpos=-hourscumneg")