Homework #6

Question 1

I ran the code with the simulated data and then commented out the simulation. # Question 2. For this assignment, I am using an open souce data set about the titanic. The variable that I assign as myVar is age. I am interested to see how age distribution on the ship compares to my expectation about age structures in developped countries. Beyond not expecting there to be an over abundance of super old or super young individuals, I do not have many assumptions about the fit of the data.

# read in libraries relevant for running this code
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# read in data set
# open source data set about the titanic
z <- read.csv("titanic.csv")
z <- na.omit(z)
str(z)

## 'data.frame':    714 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived   : int  0 1 1 1 0 0 0 1 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 1 3 3 2 3 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ SibSp      : int  1 1 0 1 0 0 3 0 1 1 ...
##  $ Parch      : int  0 0 0 0 0 0 1 2 0 1 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
##  - attr(*, "na.action")= 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
##   ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...

summary(z)

##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:714        
##  1st Qu.:222.2   1st Qu.:0.0000   1st Qu.:1.000   Class :character  
##  Median :445.0   Median :0.0000   Median :2.000   Mode  :character  
##  Mean   :448.6   Mean   :0.4062   Mean   :2.237                     
##  3rd Qu.:677.8   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##      Sex                 Age            SibSp            Parch       
##  Length:714         Min.   : 0.42   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.5126   Mean   :0.4314  
##                     3rd Qu.:38.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                     Max.   :80.00   Max.   :5.0000   Max.   :6.0000  
##     Ticket               Fare           Cabin             Embarked        
##  Length:714         Min.   :  0.00   Length:714         Length:714        
##  Class :character   1st Qu.:  8.05   Class :character   Class :character  
##  Mode  :character   Median : 15.74   Mode  :character   Mode  :character  
##                     Mean   : 34.69                                        
##                     3rd Qu.: 33.38                                        
##                     Max.   :512.33

z$myVar <- z$Age

# assigns age as column new name myVar

quick and dirty, a truncated normal distribution to work on the solution set

# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# summary(z$myVar)

p1 <- ggplot(data=z, aes(x=myVar, y=after_stat(..density..))) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean          sd    
##   29.6991176   14.5163212 
##  ( 0.5432597) ( 0.3841426)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 29.7 14.5
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.543 0.384
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.295 0 0 0.148
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 714
##  $ loglik  : num -2923
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##     mean 
## 29.69912

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat <- stat_function(aes(x = xval, y = ..y..),
                      fun = dnorm, colour="red",
                      n = length(z$myVar),
                      args = list(mean = meanML, sd = sdML)) 
p1 + stat

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar))) 
p1 + stat + stat2 + stat3

gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

Question 3

The data that best approximates my data is the normal distribution (I was between the normal distributuion and the beta distribution, but I think the normal distribution captures the lower number of observations at the beginning of the dataset).

Question 4

Simulate a new data set using the best distribution observed from this data.

length(z$myVar) # 714

## [1] 714

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean          sd    
##   29.6991176   14.5163212 
##  ( 0.5432597) ( 0.3841426)

# mean          sd    
#   29.6991176   14.5163212
str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 29.7 14.5
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.543 0.384
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.295 0 0 0.148
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 714
##  $ loglik  : num -2923
##  - attr(*, "class")= chr "fitdistr"

mean_sim <- normPars$estimate["mean"]
sd_sim <- normPars$estimate["sd"]

z$simdat <- rnorm(n = length(z$myVar), mean = mean_sim, sd = sd_sim)

s1 <- ggplot(z, aes(x=simdat, y=after_stat(..density..))) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(s1)

s1 <-  s1 +  geom_density(linetype="dotted",size=0.75)
print(s1)

meanML <- normPars$estimate["mean"] 
sdML <- normPars$estimate["sd"]
# same from earlier because distribution was generated with this information

xval_s1 <- seq(0,max(z$simdat),len=length(z$simdat))

stat_s1 <- stat_function(aes(x = xval_s1, y = ..y..),
                      fun = dnorm, colour="red",
                      n = length(z$simdat),
                      args = list(mean = meanML, sd = sdML)) 
s1 + stat_s1

hw6.Rmd

Emily Dombrowski

2024-02-21

Homework #6

Question 1

quick and dirty, a truncated normal distribution to work on the solution set

Question 3

Question 4