Bootstrap method

Matlab code:

%example 1: univariate data, heights of women

if 1

%generate random data

D = 64 + 3*randn(20,1);

N = length(D);

figure(1); clf; set(gcf,’color’,’w’,’position’,[93 330 1267 420]);

%display raw data points

subplot(1,3,1); set(gca,’fontsize’,18);

plot(D,ones(N,1),’*r’);

box off

xlabel(‘height in inches’);

axis([50 80 .9 1.1]);

set(gca,’ytick’,[]);

title(‘Raw data’);

%display data as a histogram

subplot(1,3,2); set(gca,’fontsize’,18);

hist(D,[50:.1:80]);

box off

xlabel(‘height in inches’);

ylabel(‘frequency’);

axis([50 80 0 10]);

title(‘Histogram of data’);

%display mean with errorbar representing 2 std errors

subplot(1,3,3); set(gca,’fontsize’,18);

mybar(mean(D),2*std(D)/sqrt(N));

axis([.5 1.5 50 80]);

set(gca,’xticklabel’,[]);

box off

ylabel(‘height in inches’);

title(‘Mean +/- 2*SEM’);

%bootstrap samples

%take a single bootstrap sample by sampling N items from D with replacement

figure(2); clf; set(gcf,’color’,’w’,’position’,[93 330 900 420]);

subplot(1,2,1);

set(gca,’fontsize’,18);

plot(D,ones(N,1),’*r’);

box off

xlabel(‘height in inches’);

axis([50 80 .9 1.1]);

set(gca,’ytick’,[]);

title(‘Raw data’);

hold on

%superimpose bootstrap sample on D

% d = randsample(D,N);

d = randsample(D,N);

subplot(1,2,1); set(gca,’fontsize’,18);

plot(d,1.02*ones(N,1),’or’);

%mark position of mean

subplot(1,2,2); set(gca,’fontsize’,18);

plot(mean(d),1, ‘.r’,’markersize’,25);

axis([60 70 .9 1.1]); box off; set(gca,’ytick’,[]);

hold on

%do it again 5 times

for i=1:5

d = randsample(D,N);

subplot(1,2,1);

plot(d,(1.02+i/100)*ones(N,1),’or’);

%mark position of mean

subplot(1,2,2);

plot(mean(d),1, ‘.r’,’markersize’,25);

axis([60 70 .9 1.1]);

end

%compute bootstrap standard error of sample mean

ms = [];

for i=1:10000

d = randsample(D,N);

ms(i) = mean(d);

end

%plot histogram of bootstrap means

hist(ms,[60:.5:70]);

axis([60 70 0 5000]);

box off

%find and display 95% CI

ci1 = prctile(ms,2.5);

ci2 = prctile(ms,97.5);

hold on

plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

title(‘Distribution of bootstrap means’);

end

%comparing two samples (heights of women and men)

if 1

%generate random data

WH = 64 + 3*randn(10,1);

N = length(WH);

MH = 70 + 4*randn(N,1);

figure(3); clf; set(gcf,’color’,’w’,’position’,[93 330 1267 420]);

%display raw data points

subplot(1,2,1); set(gca,’fontsize’,18);

plot(WH,ones(N,1),’*r’);

hold on

plot(MH,1.03*ones(N,1),’*b’);

box off

xlabel(‘height in inches’);

axis([50 80 .9 1.1]);

set(gca,’ytick’,[]);

title(‘Raw data’);

%display mean with errorbar representing 2 std errors

subplot(1,2,2); set(gca,’fontsize’,18);

mybar([mean(WH) mean(MH)],[2*std(WH) 2*std(MH)]/sqrt(N),[],[],[1 0 0; 0 0 1]);

axis([.5 2.5 50 80]);

set(gca,’xticklabel’,[]);

box off

ylabel(‘height in inches’);

title(‘Mean +/- 2*SEM’);

%difference in means

DM = mean(MH) – mean(WH)

%how do we compute a confidence interval for this?

%bootstrap the samples many times and compute the difference of means

%each time, then look at the distribution

dms = [];

for i=1:10000

d1 = randsample(WH,N);

d2 = randsample(MH,N);

dms(i) = mean(d2) – mean(d1);

end

%plot histogram of bootstrap difference of means

hist(dms);

box off

%find and display 95% CI

ci1 = prctile(dms,2.5);

ci2 = prctile(dms,97.5);

hold on

plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

title(‘Distribution of bootstrap difference of means’);

end

%example 3: correlation of height & weight of 20 men

H = 70 + 4*randn(100,1);

W = 2.3*H + 5*randn(100,1);

figure(4); clf; set(gcf,’color’,’w’,’position’,[93 330 1267 420]);

%display raw data points

subplot(1,2,1); set(gca,’fontsize’,18);

plot(H,W,’*b’);

box off

xlabel(‘height in inches’);

ylabel(‘weight in pounds’);

axis([50 100 100 250]);

% set(gca,’ytick’,[]);

title(‘Raw data’);

%calculate sample correlation (pearson’s r)

r = corr(H,W)

%is it significant? what is the 95% ci for r?

rs = [];

for i=1:1000

d = randsample(1:N,N);

d1 = H(d);

d2 = W(d);

rs(i) = corr(d1,d2);

end

%plot histogram of bootstrap difference of means

subplot(1,2,2); set(gca,’fontsize’,18);

hist(rs,[-1:.1:1]);

box off

%find and display 95% CI

ci1 = prctile(rs,2.5);

ci2 = prctile(rs,97.5);

hold on

plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

title(‘Distribution of bootstrap correlation values’);

% % %display data as a histogram

% % subplot(1,3,2); set(gca,’fontsize’,18);

% % hist(D,[50:2.5:80]);

% % box off

% % xlabel(‘height in inches’);

% % ylabel(‘frequency’);

% % axis([50 80 0 10]);

% % title(‘Histogram of data’);

% %

% % %display mean with errorbar representing 2 std errors

% % subplot(1,3,3); set(gca,’fontsize’,18);

% % mybar(mean(D),2*std(D)/sqrt(N));

% % axis([.5 1.5 50 80]);

% % set(gca,’xticklabel’,[]);

% % box off

% % ylabel(‘height in inches’);

% % title(‘Mean +/- 2*SEM’);

% %