**CODE**

% Let's generate a time-series with an upward linear trend,

% corrupted by a large amount of noise.

x = (1:50) + 30*randn(1,50);

% Visualize the time-series.

figure(999); clf;

plot(x);

xlabel('Time');

ylabel('Value');

title('Time-series data');

% Suppose we didn't know that the data came from a model

% with an upward trend. How can we test whether the observed

% trend is statistically significant?

% Let's perform a randomization test. The logic is as follows:

% The null hypothesis is that the data points have no dependence on

% time. If the null hypothesis is true, then we can randomly permute

% the data points and this should produce datasets that are equivalent

% to the original dataset. So, we will generate random datasets,

% compute a statistic from these datasets, and compare these values to

% the statistic generated from the original dataset. Our statistic will

% be the correlation (Pearson's r) between the time-series and a diagonal line.

[pval,val,dist] = randomization(x,2,@(x) calccorrelation(x,1:length(x)),10000,1);

% What we have done is to compute correlation values for 10000 randomly permuted

% datasets. Let's look at the distribution of correlation values obtained,

% and let's see where the original correlation value lies.

figure(998); clf; hold on;

hist(dist,100);

h = straightline(val,'v','r-');

xlabel('Correlation');

ylabel('Frequency');

legend(h,{'Actual correlation'});

title('Distribution of correlation values obtained via randomization');

% The random correlation values are most of the time less than the

% actual correlation value. The proportion of time that the actual

% correlation value is greater than the random values is the p-value.

ax = axis;

text(val,1.05*ax(4),sprintf('p = %.4g',pval),'HorizontalAlignment','center');

axis([ax(1:2) ax(3) 1.1*ax(4)]);

% Given the unlikeliness of obtaining the correlation value that we did,

% we can conclude that the null hypothesis is probably false and that

% the data points probably do have a dependence on time. (Note that

% strictly-speaking, we cannot claim that the specific form of the

% time-dependence is a linear trend; we can claim only that there is

% some dependence on time.)

% Let's do another example. Generate points in a two-dimensional

% space with a weak correlation between the x- and y-coordinates.

data = randnmulti(500,[],[1 .1; .1 1]);

% Visualize the data.

figure(997); clf;

scatter(data(:,1),data(:,2),'r.');

xlabel('X');

ylabel('Y');

title(sprintf('Data (r = %.4g)',calccorrelation(data(:,1),data(:,2))));

% Let's perform a randomization test to see if there is a statistically

% significant relationship between the x- and y-coordinates. If there is

% no relationship, then we can randomly permute the x-coordinates

% relative to the y-coordinates and this should produce datasets

% that are equivalent to the original dataset. Our statistic of

% interest will be the correlation (Pearson's r) between x- and

% y-coordinates of each dataset.

[pval,val,dist] = randomization(data(:,1),1,@(x) calccorrelation(x,data(:,2)),10000,1);

% What we have done is to compute correlation values for 10000 randomly permuted

% datasets. Let's look at the distribution of correlation values obtained,

% and let's see where the original correlation value lies.

figure(996); clf; hold on;

hist(dist,100);

h = straightline(val,'v','r-');

xlabel('Correlation');

ylabel('Frequency');

legend(h,{'Actual correlation'});

title('Distribution of correlation values obtained via randomization');

ax = axis;

text(val,1.05*ax(4),sprintf('p = %.4g',pval),'HorizontalAlignment','center');

axis([ax(1:2) ax(3) 1.1*ax(4)]);

% The random correlation values are most of the time less than the

% actual correlation value. The proportion of time that the actual

% correlation value is greater than the random values is the p-value.

% The null hypothesis (that there is no dependence between the x-

% and y-coordinates) is probably false.

## No comments:

## Post a Comment