I ran the code with the simulated data and then commented out the simulation. # Question 2. For this assignment, I am using an open souce data set about the titanic. The variable that I assign as myVar is age. I am interested to see how age distribution on the ship compares to my expectation about age structures in developped countries. Beyond not expecting there to be an over abundance of super old or super young individuals, I do not have many assumptions about the fit of the data.
# read in libraries relevant for running this code
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# read in data set
# open source data set about the titanic
z <- read.csv("titanic.csv")
z <- na.omit(z)
str(z)
## 'data.frame': 714 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : int 0 1 1 1 0 0 0 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
## - attr(*, "na.action")= 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
summary(z)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:714
## 1st Qu.:222.2 1st Qu.:0.0000 1st Qu.:1.000 Class :character
## Median :445.0 Median :0.0000 Median :2.000 Mode :character
## Mean :448.6 Mean :0.4062 Mean :2.237
## 3rd Qu.:677.8 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
## Sex Age SibSp Parch
## Length:714 Min. : 0.42 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.0000 Median :0.0000
## Mean :29.70 Mean :0.5126 Mean :0.4314
## 3rd Qu.:38.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :80.00 Max. :5.0000 Max. :6.0000
## Ticket Fare Cabin Embarked
## Length:714 Min. : 0.00 Length:714 Length:714
## Class :character 1st Qu.: 8.05 Class :character Class :character
## Mode :character Median : 15.74 Mode :character Mode :character
## Mean : 34.69
## 3rd Qu.: 33.38
## Max. :512.33
z$myVar <- z$Age
# assigns age as column new name myVar
# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# summary(z$myVar)
p1 <- ggplot(data=z, aes(x=myVar, y=after_stat(..density..))) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 29.6991176 14.5163212
## ( 0.5432597) ( 0.3841426)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 29.7 14.5
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.543 0.384
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.295 0 0 0.148
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 714
## $ loglik : num -2923
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 29.69912
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..),
fun = dnorm, colour="red",
n = length(z$myVar),
args = list(mean = meanML, sd = sdML))
p1 + stat
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
The data that best approximates my data is the normal distribution (I was between the normal distributuion and the beta distribution, but I think the normal distribution captures the lower number of observations at the beginning of the dataset).
Simulate a new data set using the best distribution observed from this data.
length(z$myVar) # 714
## [1] 714
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 29.6991176 14.5163212
## ( 0.5432597) ( 0.3841426)
# mean sd
# 29.6991176 14.5163212
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 29.7 14.5
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.543 0.384
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.295 0 0 0.148
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 714
## $ loglik : num -2923
## - attr(*, "class")= chr "fitdistr"
mean_sim <- normPars$estimate["mean"]
sd_sim <- normPars$estimate["sd"]
z$simdat <- rnorm(n = length(z$myVar), mean = mean_sim, sd = sd_sim)
s1 <- ggplot(z, aes(x=simdat, y=after_stat(..density..))) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(s1)
s1 <- s1 + geom_density(linetype="dotted",size=0.75)
print(s1)
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
# same from earlier because distribution was generated with this information
xval_s1 <- seq(0,max(z$simdat),len=length(z$simdat))
stat_s1 <- stat_function(aes(x = xval_s1, y = ..y..),
fun = dnorm, colour="red",
n = length(z$simdat),
args = list(mean = meanML, sd = sdML))
s1 + stat_s1