-
Notifications
You must be signed in to change notification settings - Fork 0
/
Probability and Discrete Distributions.R
207 lines (136 loc) · 7.18 KB
/
Probability and Discrete Distributions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Week 3 Demonstration Code Predict 401
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Examples of calculations using the classical method of assigning probabilities.
# marginal, union, joint and conditional probabilities (Section 4.4 Black)
# Problem: Each of three people are presented with 5 different soft drinks in random
# order. What is the probability the three people pick the same drink at random?
# This is sampling with replacement.
number.of.possible.outcomes <- 5*5*5
number.of.outcomes.with.agreement <- 5
number.of.outcomes.with.agreement/number.of.possible.outcomes
# Problem: One soft drink is preferred by the manufacturer. Three people pick
# two soft drinks from 5 selections which include the preferred soft drink. What
# is the probability none of the people pick the preferred soft drink at random?
# This is sampling without replacement.
# First, calculate the number of combinations for a single person.
combn(5,2)
x <- c("a","b","c","d","e")
dim(combn(x,2))[2]
total.number.combinations <- (dim(combn(x,2))[2])^3
# Second, remove the preferred drink and calculate the number of combinations for
# a single person. Then raise to the third power for three people.
y <- c("b","c","d","e")
dim(combn(y,2))[2]
reduced.number.combinations <- (dim(combn(y,2))[2])^3
reduced.number.combinations/total.number.combinations
#-------------------------------------------------------------------------------
# The following examples use the Hospital.csv data set.
hospital <- read.csv(file.path("c:/RBlack/","Hospital.csv"),sep=",")
str(hospital)
# To generate a table with margins, it is necessary to convert the variables to factors.
# In this case, this is equivalent to generating nominal variables for table construction.
# Simultaneously the variables will be labeled for easy of interpretation.
control <- factor(hospital$Control, labels = c("G_NFed","NG_NP","Profit","F_GOV"))
region <- factor(hospital$Geog..Region, labels = c("So","NE","MW",'SW',"RM","CA","NW"))
control_region <- table(control, region)
# Add marginal totals and rename for simplicity. Print the table.
# The table frequencies can be indexed by row and column.
mcr <- addmargins(control_region)
mcr
# Demonstration of calculations in Figure 4.7 of Business Statistics by Black.
# Marginal probability hospital is for profit?
mcr[3,8]/mcr[5,8]
# Union probability Rocky Mountain or Not Government Not for-profit?
(mcr[5,5]+mcr[2,8]-mcr[2,5])/mcr[5,8]
# Intersection probability for-profit in California?
mcr[3,6]/mcr[5,8]
# Conditional probability hospital is in Midwest if for-profit?
mcr[3,3]/mcr[3,8]
# Conditional probability hospital is government non-federal if in the South?
mcr[1,1]/mcr[5,1]
# Note that a conditional probability is a ratio of a single cell total (which is
# the intersection of two events) divided by a marginal total.
# Extra problem: Probability NG_NP but not Regions NE or MW?
(mcr[2,8]-mcr[2,2]-mcr[2,3])/mcr[2,8]
# Bayes' Theorem Calculation
# What is the probability a hospital is a for profit hospital if it is in MW?
# Refer to Section 4.7 Black for the formula. The numerator in
# that formula gives us the intersection probability for a MW for-profit hospital.
# The denominator sums up the intersection probabilities to provide the
# probability of a MW hospital.
mcr
numerator <- (mcr[3,8]/mcr[5,8])*(mcr[3,3]/mcr[3,8])
# The numerator is the intersection probability of MW and for-profit.
marginal <- mcr[,8]/mcr[5,8]
conditional <- mcr[,3]/mcr[,8]
intersection <- marginal*conditional
numerator/sum(intersection[1:4])
# The denominator sums up all the intersection probabilities of hospitals in the
# MW to obtain the probability of a MW hospital.
# As a check calculate the conditional probability directly.
mcr[3,3]/mcr[5,3]
# Compare to the unconditional marginal probability. There is a difference.
mcr[3,8]/mcr[5,8]
# Hospitals by type of service: general or psychiatric.-------------------------
# Breakdown of hospitals by service: general hospital=1, psychiatric=2.
# Create a factor out of Service and form a table.
service <- factor(hospital$Service, labels = c("medical", "psychiatric"))
service <- addmargins(table(service))
p <- service[2]/service[3]
p
# Demonstration of differences and similarities between dbinom() and dhyper().
dbinom(5,10,p)
# The binomial probability reflects sampling with replacement.
dhyper(5,32,168,10)
# The hypergeometric probability reflects sampling without replacement.
x <- seq(0,10)
par(mfrow = c(2,1))
plot(x,dbinom(x, 10 , prob = p), type = "h", main = "Binomial Probabilities", col = "red")
points(x,dbinom(x, 10 , prob = p), pch = 16, cex = 1)
plot(x,dhyper(x, 32 , 168, 10), type = "h",main = "Hypergeometric Probabilities", col = "blue")
points(x,dhyper(x, 32 , 168, 10), pch = 16, cex = 1)
par(mfrow = c(1,1))
par(mfrow = c(2,1))
plot(x,pbinom(x, 10 , prob = p), type = "h", main = "Binomial cdf", col = "red")
abline(h = 0.75, col = "green")
points(x,pbinom(x, 10 , prob = p), pch = 16, cex = 1)
plot(x,phyper(x, 32 , 168, 10), type = "h", main = "Hypergeometric cdf" , col = "blue")
abline(h = 0.75, col = "green")
points(x,phyper(x, 32 , 168, 10), pch = 16, cex = 1)
par(mfrow = c(1,1))
# With a discrete distribution it is difficult to divide the probabilities equally.
qbinom(0.75, 10, prob = p, lower.tail = TRUE)
qhyper(0.75, 32 , 168, 10, lower.tail = TRUE)
# By definition the pth quantile is that number x such that two conditions are met.
# P[X < x] <= p and P[X <= x] >= p.
# Poisson distribution in R-----------------------------------------------------
# Find out more using help(dpois). lambda is the expected or average value.
help(dpois)
# Using n = 10 and p = 0.16 from before we can choose lambda = 1.6 and compare.
par(mfrow = c(2,1))
plot(x,dpois(x, lambda = 1.6), type = "h", main = "Poisson Probabilities", col = "red")
points(x,dpois(x, lambda = 1.6), pch = 16, cex = 1)
plot(x,ppois(x, lambda = 1.6), type = "h", main = "Poisson cdf" , col = "blue")
points(x,ppois(x, lambda = 1.6), pch = 16, cex = 1)
abline(h = c(0.25, 0.5, 0.75), col = "green")
par(mfrow = c(1,1))
qpois(c(0.25, 0.5, 0.75), lambda = 1.6, lower.tail = TRUE)
# By definition the pth quantile is that number x such that two conditions are met.
# P[X < x] <= p and P[X <= x] >= p.
#-------------------------------------------------------------------------------
# Some caution is needed when using library functions
p <- 0.4 # Assume a binomial distribution with 500 trials.
# Suppose the probability of less than 225 successes is desired.
# pbinom includes 225 in the lower tail if lower.tail = TRUE. Must subtract 1 from 225.
pbinom(224,500,p,lower.tail=TRUE)
# This is a check. Note the following calculation which sums the lower tail.
x <- c(seq(0,224,by = 1))
sum(dbinom(x,500,p))
# If 225 or less is desired than 225 can be included in the function.
pbinom(225,500,p,lower.tail=TRUE )
# This is a check. Note the following calculation which sums the lower tail.
x <- c(seq(0,225,by = 1))
sum(dbinom(x,500,p))
# It is important to keep in mind the exclusion or inclusion of values.
#-------------------------------------------------------------------------------