Bootstrap method

Matlab code:

 

%example 1: univariate data, heights of women

 

if 1

    

    %generate random data

    D = 64 + 3*randn(20,1);

    N = length(D);

    

    figure(1); clf; set(gcf,’color’,’w’,’position’,[93         330        1267         420]);

    

    %display raw data points

    subplot(1,3,1); set(gca,’fontsize’,18);

    plot(D,ones(N,1),’*r’);

    box off

    xlabel(‘height in inches’);

    axis([50 80 .9 1.1]);

    set(gca,’ytick’,[]);

    title(‘Raw data’);

    

    %display data as a histogram

    subplot(1,3,2); set(gca,’fontsize’,18);

    hist(D,[50:.1:80]);

    box off

    xlabel(‘height in inches’);

    ylabel(‘frequency’);

    axis([50 80 0 10]);

    title(‘Histogram of data’);

    

    %display mean with errorbar representing 2 std errors

    subplot(1,3,3); set(gca,’fontsize’,18);

    mybar(mean(D),2*std(D)/sqrt(N));

    axis([.5 1.5 50 80]);

    set(gca,’xticklabel’,[]);

    box off

    ylabel(‘height in inches’);

    title(‘Mean +/- 2*SEM’);

    

    

    %bootstrap samples

    

    %take a single bootstrap sample by sampling N items from D with replacement

    

    figure(2); clf; set(gcf,’color’,’w’,’position’,[93         330        900         420]);

    

    subplot(1,2,1);

    set(gca,’fontsize’,18);

    plot(D,ones(N,1),’*r’);

    box off

    xlabel(‘height in inches’);

    axis([50 80 .9 1.1]);

    set(gca,’ytick’,[]);

    title(‘Raw data’);

    hold on

    

    

    %superimpose bootstrap sample on D

    

%     d = randsample(D,N);

    d = randsample(D,N);

    subplot(1,2,1); set(gca,’fontsize’,18);

    plot(d,1.02*ones(N,1),’or’);

    %mark position of mean

    subplot(1,2,2); set(gca,’fontsize’,18);

    plot(mean(d),1, ‘.r’,’markersize’,25);

    axis([60 70 .9 1.1]); box off; set(gca,’ytick’,[]);

    hold on

    

    

    %do it again 5 times

    for i=1:5

        d = randsample(D,N);

        subplot(1,2,1);

        plot(d,(1.02+i/100)*ones(N,1),’or’);

        %mark position of mean

        subplot(1,2,2);

        plot(mean(d),1, ‘.r’,’markersize’,25);

        axis([60 70 .9 1.1]);

    end

    

    %compute bootstrap standard error of sample mean

    ms = [];

    for i=1:10000

        d = randsample(D,N);

        ms(i) = mean(d);

    end

    

    %plot histogram of bootstrap means

    hist(ms,[60:.5:70]);

    axis([60 70 0 5000]);

    box off

    

    

    %find and display 95% CI

    ci1 = prctile(ms,2.5);

    ci2 = prctile(ms,97.5);

    hold on

    plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

    plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

    

    title(‘Distribution of bootstrap means’);

    

end

 

 

 

%comparing two samples (heights of women and men)

if 1

    %generate random data

    WH = 64 + 3*randn(10,1);

    N = length(WH);

    MH = 70 + 4*randn(N,1);

    

    

    figure(3); clf; set(gcf,’color’,’w’,’position’,[93         330        1267         420]);

    

    %display raw data points

    subplot(1,2,1); set(gca,’fontsize’,18);

    plot(WH,ones(N,1),’*r’);

    hold on

    plot(MH,1.03*ones(N,1),’*b’);

    box off

    xlabel(‘height in inches’);

    axis([50 80 .9 1.1]);

    set(gca,’ytick’,[]);

    title(‘Raw data’);

    

    

    %display mean with errorbar representing 2 std errors

    subplot(1,2,2); set(gca,’fontsize’,18);

    mybar([mean(WH) mean(MH)],[2*std(WH) 2*std(MH)]/sqrt(N),[],[],[1 0 0; 0 0 1]);

    axis([.5 2.5 50 80]);

    set(gca,’xticklabel’,[]);

    box off

    ylabel(‘height in inches’);

    title(‘Mean +/- 2*SEM’);

    

    %difference in means

    DM = mean(MH) – mean(WH)

    

    %how do we compute a confidence interval for this?

    

    %bootstrap the samples many times and compute the difference of means

    %each time, then look at the distribution

    dms = [];

    for i=1:10000

        d1 = randsample(WH,N);

        d2 = randsample(MH,N);

        dms(i) = mean(d2) – mean(d1);

    end

 

    %plot histogram of bootstrap difference of means

    hist(dms);

    box off

    

    %find and display 95% CI

    ci1 = prctile(dms,2.5);

    ci2 = prctile(dms,97.5);

    hold on

    plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

    plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

    

    title(‘Distribution of bootstrap difference of means’);

    

end

    

 

 

 

%example 3: correlation of height & weight of 20 men

H = 70 + 4*randn(100,1);

W = 2.3*H + 5*randn(100,1);

 

figure(4); clf; set(gcf,’color’,’w’,’position’,[93         330        1267         420]);

 

%display raw data points

subplot(1,2,1); set(gca,’fontsize’,18);

plot(H,W,’*b’);

box off

xlabel(‘height in inches’);

ylabel(‘weight in pounds’);

axis([50 100 100 250]);

% set(gca,’ytick’,[]);

title(‘Raw data’);

 

%calculate sample correlation (pearson’s r)

r = corr(H,W)

 

%is it significant? what is the 95% ci for r?

rs = [];

for i=1:1000

    d = randsample(1:N,N);

    d1 = H(d);

    d2 = W(d);

    rs(i) = corr(d1,d2);

end

 

%plot histogram of bootstrap difference of means

subplot(1,2,2); set(gca,’fontsize’,18);

hist(rs,[-1:.1:1]);

box off

 

%find and display 95% CI

ci1 = prctile(rs,2.5);

ci2 = prctile(rs,97.5);

hold on

plot([ci1 ci1],[0 3000],’k–‘,’linewidth’,5);

plot([ci2 ci2],[0 3000],’k–‘,’linewidth’,5);

 

title(‘Distribution of bootstrap correlation values’);

 

 

% 

% % %display data as a histogram

% % subplot(1,3,2); set(gca,’fontsize’,18);

% % hist(D,[50:2.5:80]);

% % box off

% % xlabel(‘height in inches’);

% % ylabel(‘frequency’);

% % axis([50 80 0 10]);

% % title(‘Histogram of data’);

% % 

% % %display mean with errorbar representing 2 std errors

% % subplot(1,3,3); set(gca,’fontsize’,18);

% % mybar(mean(D),2*std(D)/sqrt(N));

% % axis([.5 1.5 50 80]);

% % set(gca,’xticklabel’,[]);

% % box off

% % ylabel(‘height in inches’);

% % title(‘Mean +/- 2*SEM’);

% %