Lab Assignment 5
Sanjay D
PROBABILITY AND STATISTICS
LAB L3+L4
#LARGE SAMPLE TEST
#Prob1
#H0: mu = 10000, H1: mu < 10000 (is the claimed mean lifetime of 10,000 hours overstated?)
xbar=9900
mu0=10000
sigma=120
n=30
z=(xbar-mu0)/(sigma/sqrt(n))
z
## [1] -4.564355
alpha=0.05
z.alpha=qnorm(1-alpha)
z.alpha
## [1] 1.644854
#Since z = -4.564 lies below -1.645 (equivalently, |z| = 4.564 exceeds the
#critical value 1.645), H0 is rejected.
#There is significant evidence that the mean lifetime of the bulbs is below the
#claimed 10,000 hours, so the manufacturer's claim is rejected.
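#As a quick cross-check (a sketch, not part of the original lab output), the
#same decision follows from the lower-tail p-value of the observed z:
pval=pnorm(z)
pval   #compare with alpha = 0.05; a value below alpha leads to rejection of H0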
#Prob2
#H0: mu <= 2, H1: mu > 2 (does a cookie contain more than 2 grams of saturated fat?)
xbar=2.1
mu0=2
sigma=0.25
n=35
z=(xbar-mu0)/(sigma/sqrt(n))
z
## [1] 2.366432
alpha=0.05
z.alpha=qnorm(1-alpha)
z.alpha
## [1] 1.644854
#The test statistic 2.3664 is greater than the critical value 1.6449, hence H0
#is rejected.
#We reject the label's claim that there are at most 2 grams of saturated fat in
#a cookie.
#Prob3
#H0: mu=15.4, H1: mu not equal to 15.4
xbar=14.6
mu0=15.4
sigma=2.5
n=35
z=(xbar-mu0)/(sigma/sqrt(n))
z
## [1] -1.893146
alpha=0.05
z.half.alpha=qnorm(1-alpha/2)   #two-tailed critical value
z.half.alpha
## [1] 1.959964
pval=2*pnorm(z)
pval
## [1] 0.05833852
#Since |z| = 1.893 < 1.960 and the two-sided p-value 0.0583 > 0.05, we accept
#H0; the mean does not differ significantly from 15.4.
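#The three large-sample tests above repeat the same arithmetic; the helper
#below (a sketch, not part of the original lab) wraps the z statistic, critical
#value and p-value for a one-sample z-test computed from summary statistics.
z.test.summary=function(xbar,mu0,sigma,n,
                        alternative=c("two.sided","less","greater"),
                        alpha=0.05){
  alternative=match.arg(alternative)
  z=(xbar-mu0)/(sigma/sqrt(n))
  pval=switch(alternative,
              two.sided=2*pnorm(-abs(z)),
              less=pnorm(z),
              greater=1-pnorm(z))
  crit=if(alternative=="two.sided") qnorm(1-alpha/2) else qnorm(1-alpha)
  list(z=z,critical=crit,p.value=pval)
}
z.test.summary(14.6,15.4,2.5,35,"two.sided")   #reproduces Prob3 above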
#SMALL SAMPLE TEST
#Prob1
#H0: mu=0.3, H1: mu>0.3
x=c(0.593,0.142,0.329, 0.691, 0.231, 0.793, 0.519, 0.392, 0.418)
t.test(x,alternative = "greater",mu=0.3)
##
## One Sample t-test
##
## data: x
## t = 2.2051, df = 8, p-value = 0.02927
## alternative hypothesis: true mean is greater than 0.3
## 95 percent confidence interval:
## 0.3245133 Inf
## sample estimates:
## mean of x
## 0.4564444
#From the output we see that the p-value (0.0293) is less than 0.05, hence we
#reject H0.
#There is moderately strong evidence that the mean Salmonella level in the ice
#cream is above 0.3 MPN/g.
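#As a cross-check (illustrative only), the reported t statistic can be
#reproduced directly from the sample:
(mean(x)-0.3)/(sd(x)/sqrt(length(x)))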
#Prob2
#H0: mu = 75 (no significant difference from 75), H1: mu not equal to 75
x=c(65, 78, 88, 55, 48, 95, 66, 57, 79, 81)
t.test(x,mu=75)
##
## One Sample t-test
##
## data: x
## t = -0.78303, df = 9, p-value = 0.4537
## alternative hypothesis: true mean is not equal to 75
## 95 percent confidence interval:
## 60.22187 82.17813
## sample estimates:
## mean of x
## 71.2
#At the 5% significance level the p-value (0.4537) is greater than 0.05, hence
#we accept H0; the mean does not differ significantly from 75.
#Prob3
#H0:there is no significant difference, H1:There is significant difference
x=c(175,168,168,190,156,181,182,175,174,179)
y=c(185,169,173,173,188,186,175,174,179,180)
t.test(x,y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = -0.94737, df = 15.981, p-value = 0.3576
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.008795 4.208795
## sample estimates:
## mean of x mean of y
## 174.8 178.2
#Since the p-value (0.3576) > 0.05, we accept H0; there is no significant
#difference between the means of the two groups.
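#t.test() uses the Welch (unequal-variance) form by default; if equal variances
#are assumed, the pooled two-sample t-test can be requested instead (shown only
#as an illustration):
t.test(x,y,var.equal=TRUE)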
#Prob4
#H0: the two means are equal, H1: the mean of x is less than the mean of y (one-sided test)
x=c(15,12,13,79,8,21,9,14,8)
y=c(15,14,12,8,14,7,16,10,15,2)
t.test(x,y,alternative = "less")
##
## Welch Two Sample t-test
##
## data: x and y
## t = 1.1231, df = 8.5739, p-value = 0.8541
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 22.68686
## sample estimates:
## mean of x mean of y
## 19.88889 11.30000
#The p-value (0.8541) > 0.05, hence we accept H0.
#Prob5
#H0: there is no significant improvement for the team of athletes, H1: there is
#a significant improvement
x=c(12.9,13.5,12.8,15.6,17.2,19.2,12.6,15.3,14.4,11.3)
y=c(12.7,13.6,12.0,15.2,16.8,20.0,12.0,15.9,16.0,11.1)
t.test(x,y,paired = TRUE)
##
## Paired t-test
##
## data: x and y
## t = -0.21331, df = 9, p-value = 0.8358
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.5802549 0.4802549
## sample estimates:
## mean difference
## -0.05
#The p-value (0.8358) is greater than 0.05, hence we accept H0.
#The new training has not made any significant improvement to the team of
#athletes.
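#Equivalently (an illustrative sketch), the paired test is a one-sample t-test
#on the within-pair differences:
t.test(x-y,mu=0)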
#Prob6
#H0: there is no significant improvement for the team of athletes, H1: there is
#a significant improvement
x=c(12.9,13.5,12.8,15.6,17.2,19.2,12.6,15.3,14.4,11.3)
y=c(12.0,12.2,11.2,13.0,15.0,15.8,12.2,13.4,12.9,11.0)
t.test(x,y,paired = TRUE, alternative = "less")
##
## Paired t-test
##
## data: x and y
## t = 5.2671, df = 9, p-value = 0.9997
## alternative hypothesis: true mean difference is less than 0
## 95 percent confidence interval:
## -Inf 2.170325
## sample estimates:
## mean difference
## 1.61
#The p-value (0.9997) is greater than 0.05, hence we accept H0; this one-sided test gives no evidence that the mean of x is less than the mean of y.
#Prob7
#H0: the drug does not lower cholesterol, H1: the drug lowers cholesterol
x=c(237,289,257,228,303,275,262,304,244,233)
y=c(194,240,230,186,265,222,242,281,240,212)
t.test(x,y,paired=TRUE,alternative = "greater",mu=0)
##
## Paired t-test
##
## data: x and y
## t = 6.5594, df = 9, p-value = 5.202e-05
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
## 23.05711 Inf
## sample estimates:
## mean difference
## 32
#Since the p-value is less than 0.05, we reject H0 and support the claim that the drug lowers cholesterol.
#Prob8
#H0: the two population variances are equal, H1: the variances differ
a=c(14.1,10.1,14.7,13.7,14.0)
b=c(14.0,14.5,13.7,12.7,14.1)
var.test(a,b)
##
## F test to compare two variances
##
## data: a and b
## F = 7.3304, num df = 4, denom df = 4, p-value = 0.07954
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7632268 70.4053799
## sample estimates:
## ratio of variances
## 7.330435
#Here the p-value (0.0795) > 0.05, so we accept H0; the variances do not differ significantly.
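#The F statistic reported above is simply the ratio of the two sample
#variances; a quick illustrative check:
var(a)/var(b)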
#Practice problem 1
#H0: mu = 0 (no increase in blood pressure), H1: mu > 0 (the stimulus increases blood pressure)
x=c(5, 2, 8, -1, 3, 0, -2, 1, 5, 0, 4, 6, 8)
t.test(x, mu = 0, alternative = "greater")
## One Sample t-test
##
## data: x
## t = 3.2613, df = 12, p-value = 0.003406
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
## 1.360534 Inf
## sample estimates:
## mean of x
## 3
#The p-value (0.0034) is less than 0.05, hence H0 is rejected.
#There is significant evidence that the stimulus increases blood pressure.
#Practice problem 2
#H0:mu=25, H1:mu not equal to 25
x=c(24, 20, 30, 20, 20, 18)
t.test(x,mu=25)
##
## One Sample t-test
##
## data: x
## t = -1.6771, df = 5, p-value = 0.1544
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
## 17.4016 26.5984
## sample estimates:
## mean of x
## 22
#The p-value (0.1544) is greater than 0.05, hence H0 is accepted; the data are
#consistent with mu = 25.
#Practice Problem 3
#H0: mu = 4 (i.e., 4,000 hours), H1: mu not equal to 4
x=c(4.2,4.6,3.9,4.1,5.2,3.8,3.9,4.3,4.4,5.6 )
t.test(x,mu=4)
##
## One Sample t-test
##
## data: x
## t = 2.1483, df = 9, p-value = 0.0602
## alternative hypothesis: true mean is not equal to 4
## 95 percent confidence interval:
## 3.978809 4.821191
## sample estimates:
## mean of x
## 4.4
#The p-value (0.0602) is greater than 0.05, hence we accept H0; the average lifetime can be taken as 4,000 hours.
#Practice problem 4
#H0: the two treatments have identical effects, H1: the treatments do not have
#identical effects
x=c(2.0,2.7,2.9,1.9,2.1,2.6,2.7,2.9,3.0,2.6,2.6,2.7)
y=c(3.2,3.6,3.7,3.5,2.9,2.6,2.5,2.7)
t.test(x,y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = -2.6676, df = 12.294, p-value = 0.02014
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.96022459 -0.09810875
## sample estimates:
## mean of x mean of y
## 2.558333 3.087500
#The p-value (0.0201) is less than 0.05, hence we reject H0.
#The treatments do not have identical effects.
#GOODNESS OF FIT
#Prob1
#H0: anxiety level and family type are independent, H1: anxiety level and
#family type are not independent
data=matrix(c(35,42,61,48,51,68),ncol=3,byrow=T)
data
## [,1] [,2] [,3]
## [1,] 35 42 61
## [2,] 48 51 68
chisq.test(data)
##
## Pearson's Chi-squared test
##
## data: data
## X-squared = 0.53441, df = 2, p-value = 0.7655
#Since the p-value (0.7655) > 0.05, we accept H0; anxiety level and family type are independent.
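#To see how the statistic is formed, the expected counts under independence can
#be pulled from the test object (illustrative only):
chisq.test(data)$expected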
#Prob2
#H0: smoking habit and exercise level are independent, H1: they are not
#independent
library(MASS)
tbl=table(survey$Smoke, survey$Exer)
tbl
##
## Freq None Some
## Heavy 7 1 3
## Never 87 18 84
## Occas 12 3 4
## Regul 9 1 7
ctbl=cbind(tbl[,"Freq"],tbl[,"None"]+tbl[,"Some"])
ctbl
## [,1] [,2]
## Heavy 7 4
## Never 87 102
## Occas 12 7
## Regul 9 8
chisq.test(ctbl)
##
## Pearson's Chi-squared test
##
## data: ctbl
## X-squared = 3.2328, df = 3, p-value = 0.3571
#Since the p-value (0.3571) > 0.05, we accept H0; smoking habit and exercise
#level are independent.
#Prob3
#H0: the plants segregate in the 9:3:3:1 ratio, H1: they do not
plants=c(20,10,7,4)
chisq.test(plants,p=c(9/16,3/16,3/16,1/16))
## Warning in chisq.test(plants, p = c(9/16, 3/16, 3/16, 1/16)): Chi-squared
## approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: plants
## X-squared = 1.9702, df = 3, p-value = 0.5786
#Here the probability value p is greater than the alpha level (0.05), so we
#accept H0.
#Prob4
#H0: the data fit a binomial distribution, H1: the data do not fit a binomial distribution
x=c(5,4,3,2,1,0)
n=5
N=320
P=0.5
obf=c(14,56,110,88,40,12)
exf=dbinom(x,n,P)*N
sum(obf)
## [1] 320
sum(exf)
## [1] 320
chisq=sum((obf-exf)^2/exf)
chisq
## [1] 7.16
qchisq(0.95,5)
## [1] 11.0705
#The calculated value (7.16) is less than the critical value (11.07), hence we accept H0; the binomial distribution fits the data.
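#Equivalently (illustrative), the p-value of the computed statistic can be
#obtained directly:
pchisq(chisq,df=5,lower.tail=FALSE)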
#Prob5
#H0: the data fit a Poisson distribution, H1: the data do not fit a Poisson distribution
x=0:6
f=c(275,72,30,7,5,2,1)
lambda=(sum(f*x)/sum(f))
expf=dpois(x,lambda)*sum(f)
f1=round(expf)
sum(f)
## [1] 392
sum(f1)
## [1] 393
obf=c(275,72,30,15)
exf=c(242,117,28,6)
chisq=sum(((obf-exf)^2)/exf)
qchisq(0.95,2)
## [1] 5.991465
#Since the calculated value (about 35.45) is greater than the critical value (5.99), we reject H0; the Poisson distribution does not fit the data.
#Prob6
#H0: the data fit a normal distribution, H1: the data do not fit a normal distribution
midy=seq(17.05,86.5,length=10)
f=c(2,10,16,37,43,39,29,13,6,5)
mean=sum(f*midy)/sum(f)
sd=sqrt(sum(f*(midy-mean)^2)/sum(f))
l=seq(13.2,82.5,length=10)
l=c(1,90.2)
cdf=pnorm(l,mean,sd)
cdf=c(0,cdf,1)
pcf=diff(cdf)
f=c(0,f,0)
ex=round(pcf*sum(f),4)
fr=data.frame(f,ex)
obf=c(12,16,37,43,39,29,13,11)
exf=c(sum(ex[c(1,2,3)]),ex[c(4:9)],sum(ex[c(10,11,12)]))
sum(obf)
## [1] 200
sum(exf)
## [1] NA
chisq=sum((obf-exf)^2/exf)
chisq
## [1] NA
qchisq(0.95,5)
## [1] 11.0705
#The chi-square statistic here is NA because the class boundaries in l were reduced to only two values, so the expected frequencies could not be formed; a corrected computation is sketched below.
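#A minimal corrected sketch of the same test (the class boundaries br below are
#assumed for illustration only; they are not taken from the original problem):
br=seq(10,90,by=8)                      #hypothetical class boundaries
mid=(head(br,-1)+tail(br,-1))/2         #class midpoints
f=c(2,10,16,37,43,39,29,13,6,5)         #observed frequencies
N=sum(f)
m=sum(f*mid)/N                          #grouped mean
s=sqrt(sum(f*(mid-m)^2)/N)              #grouped standard deviation
p=diff(pnorm(c(-Inf,br[-c(1,length(br))],Inf),m,s))  #class probabilities
exf=p*N                                 #expected frequencies
chisq=sum((f-exf)^2/exf)
chisq
qchisq(0.95,length(f)-1-2)              #two parameters estimated from the data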
Practice Problems
1 A particular brand of tires claims that its deluxe tire averages at least 50,000 miles
before it needs to be replaced. From past studies of this tire, the standard deviation is
known to be 8,000 miles. A survey of owners of that tire design is conducted. From the 28
tires surveyed, the average lifespan was 46,500 miles with a standard deviation of 9,800
miles. Do the data support the claim at the 5% level?
# Given data
sample_mean <- 46500 # average lifespan from the survey
population_mean <- 50000 # claimed average lifespan
sample_size <- 28
sample_sd <- 9800 # standard deviation from the survey
# Calculate the standard error
standard_error <- sample_sd / sqrt(sample_size)
# Calculate the t-statistic
t_statistic <- (sample_mean - population_mean) / standard_error
# Degrees of freedom
df <- sample_size - 1
# Calculate the critical t-value for a one-tailed test at alpha = 0.05
critical_t <- qt(0.05, df, lower.tail = FALSE)
# Print the t-statistic and critical t-value
cat("t-statistic:", t_statistic, "\n")
## t-statistic: -1.889822
cat("Critical t-value:", critical_t, "\n")
## Critical t-value: 1.703288
# Perform the t-test and print the result
# For the left-tailed alternative (mu < 50000) the rejection region is t < -critical_t
if (t_statistic < -critical_t) {
  cat("Reject the null hypothesis. The data do not support the claim.\n")
} else {
  cat("Fail to reject the null hypothesis. The data are consistent with the claim.\n")
}
## Reject the null hypothesis. The data do not support the claim.
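# The same decision can be reached from the p-value of this one-tailed test
# (lower tail, since the alternative is mu < 50000); shown as a cross-check:
p_value <- pt(t_statistic, df)
cat("p-value:", p_value, "\n")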
2 In large city A, 20 percent of a random sample of 900 school children had defective
eyesight. In large city B, 15 percent of a random sample of 1,600 school children had the
same defect. Is the difference between the two proportions significant? Obtain 95%
confidence limits for the difference in the population proportions.
# Given data for City A
sample_size_A <- 900
defective_A <- 0.20 * sample_size_A  # Number of children with defective eyesight in City A
# Given data for City B
sample_size_B <- 1600
defective_B <- 0.15 * sample_size_B  # Number of children with defective eyesight in City B
# Proportions
p_A <- defective_A / sample_size_A   # Proportion of defective eyesight in City A
p_B <- defective_B / sample_size_B   # Proportion of defective eyesight in City B
# Standard error of the difference in proportions
SE_diff <- sqrt((p_A * (1 - p_A)) / sample_size_A + (p_B * (1 - p_B)) /
sample_size_B)
# Z-score for 95% confidence level
z <- qnorm(0.975) # Two-tailed test
# Calculate the difference in proportions
diff_proportions <- p_A - p_B
# Confidence interval for the difference in proportions
lower_limit <- diff_proportions - z * SE_diff
upper_limit <- diff_proportions + z * SE_diff
# Print the results
cat("Difference in proportions:", diff_proportions, "\n")
## Difference in proportions: 0.05
cat("95% Confidence Interval for the difference in proportions:",
lower_limit, "to", upper_limit, "\n")
## 95% Confidence Interval for the difference in proportions: 0.01855096 to 0.08144904
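# The same comparison can be cross-checked with R's built-in two-sample
# proportion test (without the continuity correction), which also reports a
# confidence interval for the difference:
prop.test(c(defective_A, defective_B), c(sample_size_A, sample_size_B),
          correct = FALSE)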
3 A cigarette manufacturing firm claims that its brand A of cigarettes outsells its brand B
by 8%. If it is found that 42 out of a sample of 200 smokers prefer brand A and 18 out of
another random sample of 100 smokers prefer brand B, test whether the 8% difference is a
valid claim.
# Given data
n_A <- 200 # Sample size for brand A
n_B <- 100 # Sample size for brand B
x_A <- 42   # Number of smokers preferring brand A
x_B <- 18   # Number of smokers preferring brand B
# Proportions
p_A <- x_A / n_A   # Proportion of smokers preferring brand A
p_B <- x_B / n_B   # Proportion of smokers preferring brand B
# Null hypothesis: The difference in proportions is 0.08 (p_A - p_B = 0.08)
# Alternative hypothesis: The difference in proportions is not 0.08 (p_A - p_B ≠ 0.08)
# Standard error of the difference in proportions
SE_diff <- sqrt((p_A * (1 - p_A)) / n_A + (p_B * (1 - p_B)) / n_B)
# Test statistic (z-score)
z <- ((p_A - p_B) - 0.08) / SE_diff
# p-value for two-tailed test
p_value <- 2 * pnorm(-abs(z))
# Print the test statistic and p-value
cat("Test Statistic (z):", z, "\n")
## Test Statistic (z): -1.041328
cat("p-value:", p_value, "\n")
## p-value: 0.2977235
# Test the hypothesis at a significance level of 0.05
if (p_value < 0.05) {
  cat("Reject the null hypothesis. There is evidence to suggest that the claim is not valid.\n")
} else {
  cat("Fail to reject the null hypothesis. There is not enough evidence to reject the claim.\n")
}
## Fail to reject the null hypothesis. There is not enough evidence to reject the claim.
4 The average number of sick days an employee takes per year is believed to be about 10.
Members of a personnel department do not believe this figure. They randomly survey 8
employees. The numbers of sick days they took in the past year are as follows: 12; 4; 15;
3; 11; 8; 6; 8. Let X = the number of sick days an employee took in the past year. Should
the personnel team believe that the average number is about 10?
# Given data
sick_days <- c(12, 4, 15, 3, 11, 8, 6, 8)  # Number of sick days taken by 8 employees
# Calculate the sample mean
sample_mean <- mean(sick_days)
# Null hypothesis: The average number of sick days is 10
# Alternative hypothesis: The average number of sick days is not 10
# Conduct a one-sample t-test
t_test_result <- t.test(sick_days, mu = 10)
# Print the test result
print(t_test_result)
##
## One Sample t-test
##
## data: sick_days
## t = -1.12, df = 7, p-value = 0.2996
## alternative hypothesis: true mean is not equal to 10
## 95 percent confidence interval:
##   4.94433 11.80567
## sample estimates:
## mean of x
##     8.375
# The p-value (0.2996) > 0.05, so H0 is not rejected; the personnel team has no
# evidence against an average of about 10 sick days.
5 The mean lifetime of a sample of 400 fluorescent light bulbs produced by a company is
found to be 1,570 hours with a standard deviation of 150 hours. Test the hypothesis that
the mean lifetime of the bulbs is 1,600 hours against the alternative hypothesis that it is
greater than 1,600 hours at the 1% and 5% levels of significance.
# Given data
sample_mean <- 1570 # Sample mean
sample_sd <- 150     # Sample standard deviation
sample_size <- 400   # Sample size
population_mean <- 1600 # Hypothesized population mean
# Generate sample data
sample_data <- rnorm(sample_size, mean = sample_mean, sd = sample_sd)
# Conduct a one-sample t-test for 1% level of significance
t_test_result_1 <- t.test(sample_data, mu = population_mean, alternative =
"greater",
conf.level = 0.99)
# Print the test result for 1% level of significance
print(t_test_result_1)
##
## One Sample t-test
##
## data: sample_data
## t = -5.3219, df = 399, p-value = 1
## alternative hypothesis: true mean is greater than 1600
## 99 percent confidence interval:
##  1540.07      Inf
## sample estimates:
## mean of x
##   1558.35
# Conduct a one-sample t-test for 5% level of significance
t_test_result_5 <- t.test(sample_data, mu = population_mean, alternative =
"greater",
conf.level = 0.95)
# Print the test result for 5% level of significance
print(t_test_result_5)
##
## One Sample t-test
##
## data: sample_data
## t = -5.3219, df = 399, p-value = 1
## alternative hypothesis: true mean is greater than 1600
## 95 percent confidence interval:
##  1545.447      Inf
## sample estimates:
## mean of x
##   1558.35
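# With the population figures treated as known from the summary statistics, the
# test can also be carried out directly as a large-sample z-test, without
# simulating data (a sketch; z here works out to exactly -4):
z <- (sample_mean - population_mean) / (sample_sd / sqrt(sample_size))
z
qnorm(0.99)   # upper-tail critical value at the 1% level
qnorm(0.95)   # upper-tail critical value at the 5% level
# z lies far below both critical values, so H0 is not rejected in favour of
# mu > 1600 at either level.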
6 A certain stimulus administered to each of 13 patients resulted in the following increases
in blood pressure: 5, 2, 8, -1, 3, 0, -2, 1, 5, 0, 4, 6, 8. Can it be concluded that the
stimulus will, in general, be accompanied by an increase in blood pressure?
# Given data
increase <- c(5, 2, 8, -1, 3, 0, -2, 1, 5, 0, 4, 6, 8)  # Increase in blood pressure for each patient
# Null hypothesis: The mean increase in blood pressure is zero (mu = 0)
# Alternative hypothesis: The mean increase in blood pressure is greater than zero (mu > 0)
# Conduct a one-sample t-test
t_test_result <- t.test(increase, alternative = "greater")
# Print the test result
print(t_test_result)
##
## One Sample t-test
##
## data: increase
## t = 3.2613, df = 12, p-value = 0.003406
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
##  1.360534      Inf
## sample estimates:
## mean of x
##         3
# The p-value (0.0034) < 0.05, so H0 is rejected; the stimulus is, in general,
# accompanied by a significant increase in blood pressure.
7 The manufacturer of a certain make of electric bulbs claims that his bulbs have a mean
life of 25 months with a standard deviation of 5 months. A random sample of 6 such bulbs
gave the following values (life of bulbs in months): 24, 20, 30, 20, 20, and 18. Can you
regard the producer’s claim to be valid at the 1% level of significance?
# Given data
sample <- c(24, 20, 30, 20, 20, 18) # Life of bulbs in months
sample_mean <- mean(sample)    # Sample mean
sample_sd <- sd(sample)        # Sample standard deviation
sample_size <- length(sample)  # Sample size
population_mean <- 25          # Claimed population mean
# Null hypothesis: The mean life of bulbs is 25 months (mu = 25)
# Alternative hypothesis: The mean life of bulbs is not 25 months (two-tailed test)
# Conduct a one-sample t-test
t_test_result <- t.test(sample, mu = population_mean, alternative =
"two.sided", conf.level = 0.99)
# Print the test result
print(t_test_result)
##
## One Sample t-test
##
## data: sample
## t = -1.6771, df = 5, p-value = 0.1544
## alternative hypothesis: true mean is not equal to 25
## 99 percent confidence interval:
## 14.78708 29.21292
## sample estimates:
## mean of x
##        22
# The p-value (0.1544) > 0.01, so H0 is not rejected; the producer's claim of a
# mean life of 25 months can be regarded as valid at the 1% level.
8 The lifetimes of electric bulbs for a random sample of 10 from a large consignment gave
the following data: 4.2, 4.6, 3.9, 4.1, 5.2, 3.8, 3.9, 4.3, 4.4, 5.6 (in ’000 hours). Can we
accept the hypothesis that the average lifetime of the bulbs is 4,000 hours?
# Given data
sample <- c(4.2, 4.6, 3.9, 4.1, 5.2, 3.8, 3.9, 4.3, 4.4, 5.6)  # Lifetime of bulbs in '000 hours
# Calculate sample statistics
sample_mean <- mean(sample)    # Sample mean
sample_sd <- sd(sample)        # Sample standard deviation
sample_size <- length(sample)  # Sample size
population_mean <- 4           # Hypothesized population mean (in '000 hours)
# Null hypothesis: The average lifetime of bulbs is 4,000 hours (mu = 4)
# Alternative hypothesis: The average lifetime of bulbs is not 4,000 hours (two-tailed test)
# Conduct a one-sample t-test
t_test_result <- t.test(sample, mu = population_mean, alternative =
"two.sided")
# Print the test result
print(t_test_result)
##
## One Sample t-test
##
## data: sample
## t = 2.1483, df = 9, p-value = 0.0602
## alternative hypothesis: true mean is not equal to 4
## 95 percent confidence interval:
## 3.978809 4.821191
## sample estimates:
## mean of x
##       4.4
# The p-value (0.0602) > 0.05, so H0 is not rejected; the hypothesis that the
# average lifetime is 4,000 hours can be accepted.
# F-test for equality of the variances of the two treatment groups compared in
# an earlier problem
a=c(2,2.7,2.9,1.9,2.1,2.6,2.7,2.9,3.0,2.6,2.6,2.7)
b=c(3.2,3.6,3.7,3.5,2.9,2.6,2.5,2.7)
u=var.test(a,b)
u
##
## F test to compare two variances
##
## data: a and b
## F = 0.58045, num df = 11, denom df = 7, p-value = 0.4033
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.1232526 2.1817180
## sample estimates:
## ratio of variances
##          0.5804544
9 The following data come from a hypothetical survey of 920 people (men and women) who were
asked for their preference among three ice cream flavors (chocolate, vanilla, strawberry).
Is there any association between gender and preference for ice cream flavor?
# Given data
men <- c(100, 120, 20)     # Men's preference for chocolate, vanilla, strawberry
women <- c(350, 320, 150)  # Women's preference for chocolate, vanilla, strawberry
# Combine the data into a matrix
ice_cream_data <- rbind(men, women)
# Perform chi-square test for independence
chi_square_test <- chisq.test(ice_cream_data)
# Print the test result
print(chi_square_test)
##
## Pearson's Chi-squared test
##
## data: ice_cream_data
## X-squared = 16.916, df = 2, p-value = 0.0002122
# The p-value (0.0002) < 0.05, so H0 is rejected; there is a significant
# association between gender and ice cream flavor preference.
10 As a part of a quality improvement project focused on the delivery of mail at a department
office within a large company, data were gathered on the number of different addresses that
had to be changed so that the mail could be redirected to the correct mail stop. The table
shows the frequency distribution. Fit a binomial distribution and test the goodness of fit.
# Given data
x <- 0:4
f_observed <- c(5, 20, 45, 20, 10)
# Total number of trials (sample size)
n <- sum(f_observed)
# Estimate the probability of success (p)
p_estimate <- sum(x * f_observed) / n
# Ensure that the probability estimate is within the valid range [0, 1]
p_estimate <- pmin(p_estimate, 0.99)  # Set maximum value to 0.99 to avoid numerical instability
p_estimate <- pmax(p_estimate, 0.01)  # Set minimum value to 0.01 to avoid numerical instability
# Calculate the expected frequencies using the binomial distribution formula
f_expected <- dbinom(x, size = max(x), prob = p_estimate) * n
# Normalize the expected frequencies so that they sum up to 1
f_expected <- f_expected / sum(f_expected)
# Perform goodness-of-fit test
goodness_of_fit_test <- chisq.test(f_observed, p = f_expected)
## Warning in chisq.test(f_observed, p = f_expected): Chi-squared approximation
## may be incorrect
# Print the test result
print(goodness_of_fit_test)
##
## Chi-squared test for given probabilities
##
## data: f_observed
## X-squared = 26044540, df = 4, p-value < 2.2e-16
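# The fit above uses the raw mean count sum(x * f_observed) / n as the success
# probability; for a Binomial(m, p) model with m = 4 trials per observation,
# the usual moment estimate is p_hat = mean / m. A minimal corrected sketch
# (illustrative only, using the same observed frequencies):
x <- 0:4
f_observed <- c(5, 20, 45, 20, 10)
n <- sum(f_observed)
m <- max(x)                                # trials per observation
p_hat <- sum(x * f_observed) / (n * m)     # moment estimate of p
f_expected <- dbinom(x, size = m, prob = p_hat) * n
chisq_stat <- sum((f_observed - f_expected)^2 / f_expected)
chisq_stat
qchisq(0.95, df = length(x) - 1 - 1)       # one parameter (p) estimated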