#######################################################
######### SOEP Practice Data ##################
#######################################################

# -> There is full SOEP
# -> There is SOEP teaching data (50%)
# -> And there is a "practice data set"
# https://www.diw.de/en/diw_02.c.222838.en/soep_in_the_college_classroom.html

# This dataset in STATA format is based on the original SOEP data,
# but provides them in a significantly altered and fully anonymous form,
# which allows the practice dataset to be used independently
# of data distribution contracts and user agreements.
# The practice dataset consists of a total of 26 original variables
# and 12,922 measurements, covers five time points,
# and is available in "SOEPlong" format.

#######################################################
######### Load data ####################
#######################################################

# Adjust to your working directory.
# Create a folder for R scripts and data somewhere.
# Attention: a Windows path like "C:\YOURPATH" will not be recognized.
# Instead you have to add an additional backslash or use forward slashes.
getwd()

# Only switch the working directory when the folder exists on this machine,
# so the script does not abort at this point on somebody else's computer.
project_dir <- "C:\\Users\\Marco2014\\Dropbox\\Viadrina\\2020 SS\\Seminar in Applied Economics\\Projects"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project folder not found, keeping working directory: ", getwd(),
    call. = FALSE
  )
}

# Load data (the .dta file is looked up in the current working directory)
library(haven)
soep <- read_dta("soep_lebensz_en.dta")
# There are two panel identifiers: person id and year

#######################################################
######### Base R Analysis ##################
#######################################################

# With View() you can inspect and scroll tabular data.
# The viewer is only opened in an interactive session.
if (interactive()) {
  View(soep)
}

# Show me some of the first entries in the dataset
head(soep)

# We can get descriptive statistics for all variables
summary(soep)
# For which variables does calculating a mean make no sense?
# Which variables contain missing values ("NA")?

# Count the number of fe/males
# Does 0 refer to males or females?
table(soep$sex)
#    0    1
# 5958 6964

# Remember what panel data is? Certainly we counted people multiple times.
# Let's connect commands, show me the first results from the table count:
head(table(soep$id))

# google: "base r keep only first of group"
# https://lmgtfy.com/?q=base+r+keep+only+first+of+group
# first hit:
# https://stackoverflow.com/questions/13279582/select-the-first-row-by-group
# duplicated() flags every repeated id, so negating it keeps the first row
# of each person.
soep_1obs <- soep[!duplicated(soep$id), ]
table(soep_1obs$sex)
#    0    1
# 1652 1898

# Compare frequencies
prop.table(table(soep$sex))
prop.table(table(soep_1obs$sex))

# When we ignore the panel data format it's called pooled data analysis
table(soep$no_kids)
# Remember missing values?
table(soep$no_kids, useNA = "always")

# Check the help, section "Note".
# It gives you advice on good data visualization practice.
?pie

# Real system missings are not allowed by default.
# How can you make pie() work?
# This call is a deliberate demonstration and is expected to fail when
# no_kids contains NA; try() prints the error and lets the script continue.
try(pie(soep$no_kids))

# Remember which command only outputs the count of number of children?
table(soep$no_kids)
# Surround it by the pie command!
pie(table(soep$no_kids))
# Also display the missings
pie(table(soep$no_kids, useNA = "always"))

# A much better way of data visualization
barplot(table(soep_1obs$no_kids))
# Improve labels and layout
barplot(
  table(soep_1obs$no_kids),
  xlab = "Number of Children",
  ylab = "Absolute Frequency",
  main = "Barplot of Number of Children",
  sub = "SOEP practice data set."
)

#######################################################
######### Univariate and bivariate analysis ###########
#######################################################

# One boxplot
boxplot(soep$education)

# A grouped boxplot
boxplot(education ~ no_kids, data = soep)

# A boxplot is a very good summary of the data.
# Still, it hides the individual data points, especially how many there are.
# Grouped boxplot with the individual observations drawn on top:
# stripchart(add = TRUE) adds the jittered points to the existing plot.
boxplot(education ~ no_kids, data = soep, lwd = 2, ylab = "Years of Education")
stripchart(
  education ~ no_kids,
  data = soep,
  vertical = TRUE,
  method = "jitter",
  add = TRUE,
  pch = 20,
  col = "blue"
)

# Histogram
hist(soep$education)

# Create a png file from this plot.
# Everything drawn between png() and dev.off() goes into the file,
# which is written to the current working directory.
png("my_first_histogram_0123456.png", width = 800, height = 600)
hist(
  soep$education,
  main = "Histogram",
  xlab = "Number of Years of Education",
  ylab = "Count",
  col = "red"
)
dev.off()

# Correlation for two variables
cor(
  x = soep$education,
  y = soep$satisf_org,
  method = "pearson",
  use = "complete.obs"
)

# Correlation matrix for entire dataset.
# cor() needs numeric (or logical) input, so any other column (e.g. strings
# or dates, should the data set contain some) is left out instead of making
# the call fail. If all columns are numeric the result is unchanged.
is_corr_col <- vapply(
  soep,
  function(col) is.numeric(col) || is.logical(col),
  logical(1)
)
round(cor(soep[is_corr_col], use = "complete.obs"), 3)