Lab #1:
-
Center for Tropical Ecology and Biodiversity,
Tunghai University & Fushan Botanical Garden
http://www.nku.edu/~longa/cgi-bin/cgi-tcl-examples/generic/SA/SA.cgi
#############################################
# THE AUTOCORRELATION GAME #
#############################################
# to start: run the code below #
############################################# rm(list = ls()) library(spdep) s<-function(i,j){ a<-dat[i] b<-dat[j] dat[i]<-b dat[j]<-a return(dat)
} plot(0:1,0:1,,xaxt="n",yaxt="n",bty="n",xlab="",ylab="
",type="n") segments(seq(0,1,by=0.25),rep(0,5),seq(0,1,by=0.25),re p(1,5)) segments(rep(0,5),seq(0,1,by=0.25),rep(1,5),seq(0,1,by
=0.25)) text(x,y,round(dat,2),cex=2) text(x+0.10,y+0.10,1:16,col="grey",cex=0.8) neigh<-cell2nb(4, 4, type="rook", torus=FALSE) listw<-nb2listw(neigh, style="B") title(main=paste("Moran
I=",round(moran.test(dat,listw)$estimate[1],3))) plot.new<-function(dat){ plot(0:1,0:1,,xaxt="n",yaxt="n",bty="n",xlab="",ylab=
"",type="n") segments(seq(0,1,by=0.25),rep(0,5),seq(0,1,by=0.25),r ep(1,5)) segments(rep(0,5),seq(0,1,by=0.25),rep(1,5),seq(0,1,b y=0.25)) text(x,y,round(dat,2),cex=2) text(x+0.10,y+0.10,1:16,col="grey",cex=0.8) title(main=paste("Moran
I=",round(moran.test(dat,listw)$estimate[1],3)))
} x<-rep(seq(0.25/2,0.75+0.25/2,by=.25),4) y<-rep(seq(0.25/2,0.75+0.25/2,by=.25),each=4) dat<-runif(16,0,10)
######################################################
# THE AUTOCORRELATION GAME #
######################################################
# how to play: #
# 1. to start: run the code above a 4-by-4 lattice #
# will appear the values in black are simulate #
# from a uniform distribution with range (0,10) #
# and rounded up to 1/100 cells are labelled by #
# the grey number on the top-right corner #
# Moran I gives the value of the autocorrelation #
# index for the current data on the lattice #
# 2. to switch the values in cells i and j: #
# type "dat<-s(i,j)" #
# 3. to see the result: type "plot.new(dat)" #
# 4. have fun #
###################################################### dat<-s(2,15) plot.new(dat)
• file paths
• INPUT statements (e.g., list of variables)
• n, n-1 (Puerto Rico: 73, 72)
• variables in KEEP= statements
• variables in outfile ( DATA _NULL_ ) PUT statements (e.g., list of variables)
• TRY definition
max
FILENAME CONN 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-CON.TXT';
FILENAME OUTFILE1 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-EIGENVECTORS.TXT';
FILENAME OUTFILE2 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-MC-EIG.TXT';
FILENAME OUTFILE3 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-EIG.TXT';
*******************************************
* CHANGES NEED TO BE MADE FOR N AND (N-1) *
******************************************;
TITLE 'EIGENFUNCTIONS FOR THE PR DATA';
*****************************************************************************************************************
* READ IN THE N-BY-N GEOGRAPHIC CONNECTIVITY MATRIX; CODE THE DIAGONAL WITH 1S, MAKE SURE NOT TO MISREAD THE ID *
*****************************************************************************************************************;
DATA STEP1; INFILE CONN; INPUT IDC C1-C73; DROP IDC; RUN;
DATA STEP2;
ARRAY M{73} M1-M73;
DO I=1 TO 73;
DO J=1 TO 73;
M{J} = -1/73;
IF I=J THEN M{J}=1-1/73;
END;
OUTPUT;
END;
DROP I J;
RUN;
PROC IML;
USE STEP1; READ ALL INTO C;
USE STEP2; READ ALL INTO M;
IDEN=I(73);
CM=C*M;
MCM=M*CM;
ONE=J(73,1);
CSUM=C*ONE;
CALL EIGEN(EVALS,EVECS,MCM);
CREATE STEP3 FROM EVALS;
APPEND FROM EVALS;
CREATE STEP4 FROM EVECS;
APPEND FROM EVECS;
CREATE STEP5 FROM CSUM;
APPEND FROM CSUM;
QUIT;
DATA STEP3(REPLACE=YES); SET STEP3; EVALS=COL1; DROP COL1; RUN;
DATA STEP4(REPLACE=YES); SET STEP4;
ARRAY EVECS{73} E1-E73;
ARRAY IMLOUT{73} COL1-COL73;
DO I=1 TO 73;
DROP COL1-COL73 I;
X0=1;
RUN;
EVECS{I} = IMLOUT{I};
END;
DATA STEP5(REPLACE=YES); SET STEP5; CSUM=COL1; DROP COL1; RUN;
DATA COMBINE; SET STEP3; SET STEP4; SET STEP5; RUN;
PROC MEANS DATA=COMBINE NOPRINT; VAR CSUM X0; OUTPUT OUT=CSUM SUM=CSUM N; RUN;
PROC MEANS DATA=COMBINE NOPRINT; VAR EVALS; OUTPUT OUT=LAM1 MAX=LAM1; RUN;
DATA MAXMC; SET CSUM(KEEP=CSUM N); SET LAM1(KEEP=LAM1);
MCMAX = N*LAM1/CSUM;
RUN;
PROC PRINT; RUN;
DATA _NULL_;
SET STEP4;
FILE OUTFILE1 LRECL=2048;
ID=_N_;
PUT ID E1-E73;
RUN; writing to output files
DATA _NULL_;
SET STEP3; IF _N_=1 THEN SET CSUM; IF _N_=1 THEN SET LAM1;
FILE OUTFILE2;
ID=_N_;
MCO=N*EVALS/CSUM;
MCADJ=EVALS/LAM1;
PUT ID EVALS MCO MCADJ;
RUN;
DATA STEP1(REPLACE=YES); SET STEP1;
ARRAY CONN{73} C1-C73;
CSUM=0;
DO I=1 TO 73;
IF _N_=I THEN CONN{I} = 1;
CSUM=CSUM + CONN{I};
END;
IDC=_N_;
X0=1;
CSUM=CSUM-1;
DROP I;
RUN;
PROC MEANS DATA=STEP1 NOPRINT; VAR X0 CSUM; OUTPUT OUT=APP SUM=N CSUM; RUN;
PROC PRINCOMP DATA=STEP1(TYPE=CORR) OUTSTAT=EVECS NOPRINT; VAR C1-C73; RUN;
DATA EVECSTMP; SET EVECS; IF _N_=1 THEN SET APP(KEEP=CSUM N);
IF _TYPE_='EIGENVAL' THEN IKEEP=1; ELSE IKEEP=0;
IF IKEEP=0 THEN DELETE;
EVAL1=C2-1;
MCMAXAPP = N*EVAL1/CSUM;
DROP _TYPE_ _NAME_ IKEEP C1-C73;
RUN;
PROC PRINT; RUN;
DATA EVALS; SET EVECS; IF _TYPE_="EIGENVAL" THEN IKEEP=1; ELSE IKEEP=0; IF IKEEP=0 THEN DELETE; RUN;
PROC TRANSPOSE DATA=EVALS OUT=APPEVALS PREFIX=EVALS; VAR C1-C73; RUN;
DATA EIGEN; SET APPEVALS(KEEP=EVALS1); LAMBDAC=EVALS1-1; DROP EVALS1; RUN;
DATA APPEVALS(REPLACE=YES); SET APPEVALS; EVALS=EVALS1-1; IF _NAME_="C1" THEN DELETE; DROP _NAME_ EVALS1; RUN;
PROC TRANSPOSE DATA=STEP1 OUT=TCSUM PREFIX=TCSUM; VAR CSUM; RUN;
DATA MATRIXW; SET STEP1; IF _N_=1 THEN SET TCSUM; _TYPE_="CORR";
ARRAY TCSUM{73} TCSUM1-TCSUM73;
ARRAY CONN{73} C1-C73;
DO I=1 TO 73;
CONN{I}=CONN{I}/(SQRT(TCSUM{I})*SQRT(CSUM));
IF _N_=I THEN CONN{I} = 1;
END;
IDC=_N_;
DROP TCSUM1-TCSUM73 I CSUM _NAME_;
RUN;
PROC PRINCOMP DATA=MATRIXW(TYPE=CORR) OUTSTAT=WEVECS NOPRINT; VAR C1-C73; RUN;
DATA WEVALSTMP; SET WEVECS; IF _N_=1 THEN SET APP(KEEP=CSUM N);
IF _TYPE_='EIGENVAL' THEN IKEEP=1; ELSE IKEEP=0;
IF IKEEP=0 THEN DELETE;
DROP _TYPE_ _NAME_ IKEEP;
RUN;
PROC TRANSPOSE DATA=WEVALSTMP OUT=WEVALS PREFIX=WEVALS; VAR C1-C73; RUN;
DATA EIGEN(REPLACE=YES); SET EIGEN; SET WEVALS(KEEP=WEVALS1); LAMBDAW=WEVALS1-1; DROP WEVALS1; RUN;
PROC MEANS; VAR LAMBDAW; RUN;
DATA _NULL_;
SET EIGEN;
FILE OUTFILE3;
ID=_N_;
PUT ID LAMBDAC LAMBDAW;
RUN;
****************************************** writing to output file for SAR
* ADJUSTING THE APPROXIMATE EIGENVECTORS *
******************************************;
DATA EVECS(REPLACE=YES); SET EVECS;
IF _TYPE_='SCORE' THEN IKEEP=1; ELSE IKEEP=0;
IF IKEEP=0 THEN DELETE;
RUN;
PROC TRANSPOSE OUT=STEP6 PREFIX=C; VAR C1-C73; RUN;
DATA STEP6(REPLACE=YES); SET STEP6; SET STEP1(KEEP=IDC); RUN;
PROC FACTOR DATA=STEP6 N=72 ROTATE=QUARTIMAX OUT=FINAL OUTSTAT=FINALE NOPRINT; VAR C2-C73; RUN;
/*
DATA _NULL_;
SET FINAL;
FILE OUTFILE1 LRECL=2048;
ID=_N_;
PUT ID FACTOR1-FACTOR72;
RUN;
DATA _NULL_;
SET APPEVALS; IF _N_=1 THEN SET CSUM; IF _N_=1 THEN SET EVECSTMP;
FILE OUTFILE2;
ID=_N_;
MCO=N*EVALS/CSUM;
MCADJ=EVALS/EVAL1; writing approximations
PUT ID EVALS MCO MCADJ; to output files
RUN;
*/
From OLS theory: b = ( X T X ) -1 X T Y
• Convert the attribute variable in question to zscores
•
• Let X = z b numerator
Y
and Y = Cz
Y
T
1 T
( z
Y z
Y
) z
Y
Cz
Y
• Regress Cz
Y on z
Y
, with a no-intercept option
•
• Let X = 1 and Y = C1 b denominato r
( 1
T
1 )
1
1
T
C1
• Regress C1 on 1 , with a no-intercept option
• MC = b numerator
/b denominator
GR
n
1
2 1
T
C1
2 i n
2 n
1 i
1 c ij i n
1 z
2 y, i
n
1
MC n squared deviations row sums of matrix C
The GR contains all of the locational information captured by the MC, but emphasizes any marked deviations (e.g., outliers).
FILENAME INDATA ‘D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
FILENAME CONN 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-CON.TXT';
TITLE 'MC AND GR FOR THE PR AREA DATA';
**************************************************************
* READ IN THE N-BY-N GEOGRAPHIC CONNECTIVITY MATRIX; *
* CODE THE DIAGONAL WITH 1S, MAKE SURE NOT TO MISREAD THE ID *
**************************************************************;
DATA STEP1;
INFILE CONN;
INPUT IDC C1-C73;
ARRAY CONN{73} C1-C73;
CSUM = 0;
DO I=1 TO 73;
CSUM = CSUM + CONN{I};
END;
RUN;
PROC MEANS DATA=STEP1 NOPRINT; VAR CSUM; OUTPUT OUT=CSUM SUM=SUMCSUM; RUN;
************************************************
* READ IN GEOREFERENCED DATA; *
* THEN CENTER THE SELEDTED ATTRIBUTE VARIABLE *
************************************************;
DATA STEP2;
INFILE INDATA LRECL=1024;
INPUT ID AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
Y = LOG(AREAM + 0.001);
* Y = TRMPT_RATIO;
X0=1;
RUN;
PROC STANDARD MEAN=0 STD=1 OUT=STEP2(REPLACE=YES); VAR Y; RUN;
PROC MEANS NOPRINT; VAR X0; OUTPUT OUT=STEP0 SUM=N; RUN;
DATA STEP0(REPLACE=YES);
SET STEP0;
SET CSUM;
APPSEMC = SQRT(2/SUMCSUM);
RUN;
DATA STEP2(REPLACE=YES);
SET STEP2; SET STEP1;
ARRAY CONN{73} C1-C73;
ARRAY YCONN{73} CY1-CY73;
DO I=1 TO 73;
YCONN{I} = Y*CONN{I};
END;
Y2CSUM=(Y**2)*CSUM;
RUN;
PROC MEANS NOPRINT;
VAR CY1-CY73;
OUTPUT OUT=CYOUT1 SUM=CY1-CY73;
RUN;
PROC TRANSPOSE DATA=CYOUT1 PREFIX=CY OUT=CYOUT2;
VAR CY1-CY73;
RUN;
DATA STEP3;
SET STEP2(KEEP=X0 Y2CSUM Y CSUM);
SET CYOUT2(KEEP=CY1);
CZY=CY1;
ZY=Y;
DROP CY1;
X0GR=X0;
RUN;
PROC REG OUTEST=OUTNMCB; MODEL CZY=ZY/NOINT; OUTPUT OUT=MCPRED P=CZYHAT; RUN;
AND
*********************************
* *
* CONSTRUCTING A MC SCATTERPLOT *
* *
*********************************;
PROC GPLOT; PLOT CZY*ZY CZYHAT*ZY/OVERLAY; RUN;
PROC REG DATA=STEP3 OUTEST=OUTDMCB; MODEL CSUM=X0/NOINT; RUN;
PROC REG DATA=STEP3 OUTEST=OUTNGRB NOPRINT; MODEL Y2CSUM=X0GR/NOINT; RUN;
DATA STEP0(REPLACE=YES);
SET STEP0;
SET OUTNMCB(KEEP=ZY);
SET OUTDMCB(KEEP=X0);
SET OUTNGRB(KEEP=X0GR);
MC = ZY/X0;
EMC = -1/(N-1);
GR = X0GR/X0 - ((N-1)/N)*MC;
IF MC> 0 THEN PROBMC=1 - PROBNORM((MC-EMC)/APPSEMC);
ELSE PROBMC= PROBNORM((MC-EMC)/APPSEMC);
DROP ZY X0;
RUN;
PROC PRINT; VAR N GR MC EMC APPSEMC PROBMC; RUN;
FILENAME INDATA1 ‘D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
FILENAME INDATA2 ‘D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-DEM&QUAD-DATA.TXT';
TITLE 'SEMIVARIOGRAM CONSTRUCTION FOR THE PR DATA';
************************************************
* READ IN GEOREFERENCED DATA; *
* THEN CENTER THE SELEDTED ATTRIBUTE VARIABLE *
************************************************;
DATA STEP1; INFILE INDATA1 LRECL=1024;
INPUT ID AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
* Y = LOG(AREAM + 0.001);
* Y = TRMPT_RATIO;
RUN;
PROC SORT DATA=STEP1 OUT=STEP1(REPLACE=YES); BY NAME; RUN;
DATA STEP2; INFILE INDATA2 LRECL=1024;
INPUT IDDEM MELEV SELEV U V QUAD NAME$;
U=U/1000; V=V/1000;
* Y=LOG(MELEV+17.5);
Y=(SELEV-25)**0.5;
RUN;
PROC SORT DATA=STEP2 OUT=STEP2(REPLACE=YES); BY NAME; RUN;
DATA STEP3; MERGE STEP1 STEP2; BY NAME; RUN;
PROC VARIOGRAM DATA=STEP3 OUTP=PAIRS;
COMPUTE NOVARIOGRAM;
COORDINATES XC=U YC=V;
VAR Y;
RUN;
DATA PAIRS(REPLACE=YES); SET PAIRS; GAMMA=(V1-V2)**2; DROP X1 Y1 X2 Y2 COS VARNAME;
RUN;
PROC MEANS MAX; VAR DISTANCE; RUN;
PROC VARIOGRAM DATA=STEP3 OUTVAR=GAMMA;
COMPUTE LAGD=3 MAXLAGS=50;
COORDINATES XC=U YC=V;
VAR Y;
RUN;
DATA GAMMA(REPLACE=YES); SET GAMMA;
IF LAG =0 THEN DELETE;
IF LAG=-1 THEN LAG=0; IF LAG=0 THEN DISTANCE=0; IF LAG=0 THEN VARIOG=0; RUN;
PROC PRINT; RUN;
PROC GPLOT; PLOT VARIOG*DISTANCE; RUN;
FILENAME INDATA1 ‘D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
FILENAME INDATA2 ‘D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-DEM&QUAD-DATA.TXT';
TITLE 'CONSTRUCT A SEMIVARIOGRAM FOR THE PR AREA DATA';
DATA STEP0; MAXDIST=80; RUN;
************************************************
* READ IN GEOREFERENCED DATA; *
* THEN CENTER THE SELEDTED ATTRIBUTE VARIABLE *
************************************************;
DATA STEP1A; INFILE INDATA1 LRECL=1024;
INPUT ID AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
* Y = LOG(AREAM + 0.001);
* Y = TRMPT_RATIO;
X0=1;
RUN;
PROC SORT DATA=STEP1A OUT=STEP1A(REPLACE=YES); BY NAME; RUN;
PROC MEANS NOPRINT; VAR X0; OUTPUT OUT=OUTN SUM=N; RUN;
DATA STEP1B; INFILE INDATA2 LRECL=1024;
INPUT IDDEM MELEV SELEV U V QUAD NAME$;
U=U/1000; V=V/1000; coordinates are labeled u, v
* Y=LOG(MELEV+17.5);
Y=(SELEV-25)**0.5;
RUN;
PROC SORT DATA=STEP1B OUT=STEP1B(REPLACE=YES); BY NAME; RUN;
DATA STEP1; MERGE STEP1A STEP1B; BY NAME; RUN;
PROC VARIOGRAM DATA=STEP1 OUTP=PAIRS;
COMPUTE NOVARIOGRAM;
COORDINATES XC=U YC=V;
VAR Y;
RUN;
DATA PAIRS(REPLACE=YES); SET PAIRS; GAMMA=(V1-V2)**2; DROP X1 Y1 X2 Y2 COS VARNAME;
STDDIST=DISTANCE;
DIST=DISTANCE;
COUNT=1;
RUN;
PROC MEANS SUM; VAR COUNT; RUN;
PROC MEANS MAX; VAR DISTANCE; RUN;
PROC STANDARD OUT=STEP1(REPLACE=YES) MEAN=0 STD=1; /* CREATES STANDARDIZED DISTANCE INDEX */
VAR STDDIST;
RUN;
DATA PAIRS(REPLACE=YES); SET PAIRS;
STDDIST=ROUND(STDDIST,.001); /* ROUNDS STANDARDIZED DISTANCE INDEX */
RUN;
PROC SORT DATA=PAIRS(REPLACE=YES); BY STDDIST; RUN;
PROC MEANS MEAN NOPRINT; BY STDDIST;
VAR DIST GAMMA; /* INITIAL CLUSTERING USING STANDARDIZED
DISTANCES INDEX */
OUTPUT OUT=TMEAN MEAN=MDIST MGAMMA;
RUN;
DATA STEP2; SET TMEAN;
SEQID=_N_;
DIST=MDIST; GAMMA=MGAMMA;
DROP MDIST MGAMMA;
RUN;
PROC SORT OUT=STEP2(REPLACE=YES); BY DIST; RUN;
DATA STEP2(REPLACE=YES);
SET STEP2;
SEQID = _N_;
MERGEVAL=1;
RUN;
PROC CLUSTER METHOD=WARD NOPRINT OUTTREE=TREE;
ID SEQID;
VAR DIST;
COPY GAMMA;
FREQ _FREQ_;
RUN;
**************************************************
* THE NUMBER OF GROUPS (N = ???) NEEDS TO BE SET *
**************************************************;
PROC TREE DATA=TREE NOPRINT N=75 OUT=TEMP1; ID SEQID; RUN;
PROC SORT DATA=TEMP1 OUT=TEMP1(REPLACE=YES); BY SEQID; RUN;
DATA STEP2 (REPLACE=YES);
MERGE STEP2(DROP=MERGEVAL) TEMP1;
BY SEQID;
RUN;
DATA OUTN(REPLACE=YES); SET OUTN; DIST=0.0001; STDDIST=DIST; GAMMA=0; _TYPE_=0;
_FREQ_=N; SEQID=0; CLUSTER=0; CLUSNAME='CL000'; RUN;
DATA STEP2(REPLACE=YES); SET STEP2 OUTN; RUN;
PROC SORT DATA=STEP2 OUT=STEP1(REPLACE=YES);
BY CLUSTER;
RUN;
PROC MEANS DATA=STEP1 NOPRINT; BY CLUSTER;
VAR DIST GAMMA;
FREQ _FREQ_;
OUTPUT OUT=RESULTS MIN=DISTMIN GAMMAMIN MAX=DISTMAX GAMMAMAX MEAN=DISTM GAMMAM;
RUN;
PROC SORT OUT=STEP1(REPLACE=YES); BY DISTM; RUN;
PROC PRINT; VAR GAMMAM DISTM _FREQ_; RUN;
PROC MEANS SUM; VAR _FREQ_; RUN;
PROC GPLOT DATA=RESULTS; PLOT GAMMAM*DISTM; RUN;
DATA RESULTS(REPLACE=YES);
SET RESULTS;
IF _N_=1 THEN SET STEP0;
IF DISTM > MAXDIST THEN DELETE;
RUN;
*******************************************
* *
* TREND LINES FOR THE SEMIVARIOGRAM PLOTS *
* *
*******************************************;
/*
SPHERICAL
*/
TITLE 'SPHERICAL SEMIVARIOGRAM MODEL';
PROC NLIN DATA=RESULTS NOITPRINT MAXITER=500 METHOD=MARQUARDT;
PARMS C0=0.01 C1=1 R=10;
BOUNDS C0>0, C1>0, O<R;
IF DISTM<=R THEN
DO;
MODEL GAMMAM = C0 + C1*((3/2)*(DISTM/R) - (1/2)*(DISTM/R)**3);
_WEIGHT_ = _FREQ_;
END;
ELSE
DO;
MODEL GAMMAM = C0 + C1;
_WEIGHT_ = _FREQ_;
END;
OUTPUT OUT=TEMP1 P=GAMMAMHAT;
RUN;
PROC GPLOT; PLOT GAMMAM*DISTM GAMMAMHAT*DISTM/OVERLAY; RUN;
/*
GAUSSIAN
*/
.
.
.
/*
BESSEL
*/
->
->
->
->
->
->
->
What you learned in today’s lab:
1. Change path names, etc. (Slide #4 —print for a ready reference)
2. Compute eigenfunctions
3. Compute MC and GR
4. Construct Moran scatterplots and semivariogram plots
5. Fit trend lines to Moran scatterplots and semi-variogram plots
6. Implement kriging in ArcGIS