Lab #2: Spatial statistics in practice - variable transformations - LISA statistics

advertisement
Lab #2:
- variable transformations
- LISA statistics
- Getis-Ord statistics
- constructing spider
diagrams
Spatial statistics in practice
Center for Tropical Ecology and Biodiversity,
Tunghai University & Fushan Botanical Garden
SAS code for identifying a Box-Cox type of
response variable transformation
FILENAME INDATA 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
TITLE 'SW-BASED TRANSFORMATIONS FOR THE PR AREA DATA';
************************************************
* READ IN GEOREFERENCED DATA;
*
* THEN CENTER THE SELEDTED ATTRIBUTE VARIABLE *
************************************************;
DATA STEP1;
INFILE INDATA LRECL=1024;
INPUT ID AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
Y = AREAM;
X0=1;
RUN;
PROC UNIVARIATE NORMAL; VAR Y; RUN;
PROC UNIVARIATE NOPRINT; VAR Y; OUTPUT OUT=TEMP MEDIAN=XM MAX=YMAX MIN=YMIN; RUN;
DATA TEMP(REPLACE=YES);
SET TEMP;
IF YMIN>0 THEN DEC = YMAX/YMIN; ELSE DEM=YMAX;
RUN;
PROC PRINT; VAR DEC YMAX YMIN XM; RUN;
PROC RANK DATA=STEP1 OUT=STEP1 (REPLACE=YES) NORMAL=BLOM; VAR Y; RANKS NY; RUN;
PROC RANK DATA=STEP1 OUT=STEP1 (REPLACE=YES); VAR Y; RANKS RY; RUN;
DATA STEP1 (REPLACE=YES);
SET STEP1;
IF _N_=1 THEN SET TEMP(KEEP=YMIN YMAX);
Y=Y-YMIN;
RUN;
****************************************************
* A JACOBIAN TERM MAY BE NEEDED HERE IF N IS SMALL *
****************************************************
* LOG TRANSFORMATION (POWER OF 0) WITH TRANSLATION *
****************************************************;
PROC NLIN DATA=STEP1 NOITPRINT MAXITER=5000 METHOD=MARQUARDT;
PARMS A=0 B=1 D=0.01;
BOUNDS 0<D;
MODEL NY = A + B*LOG(Y+D);
OUTPUT OUT=TEMP1 PARMS=A B D;
RUN;
DATA TEMP1(REPLACE=YES); SET TEMP1;
D=D-YMIN;
RUN;
PROC MEANS; VAR D; RUN;
******************************************************
* POWER TRANSFORMATION (POWER <> 0) WITH TRANSLATION *
******************************************************;
PROC NLIN DATA=STEP1 NOITPRINT MAXITER=1000 METHOD=MARQUARDT;
PARMS A=0 B=1 C=0.5; D=1.0E-10;
BOUNDS 0<D;
MODEL NY = A + B*((Y + D)**C - 1)/C;
OUTPUT OUT=TEMP2 PARMS=A B D;
RUN;
DATA TEMP2(REPLACE=YES); SET TEMP2;
D=D-YMIN;
RUN;
PROC MEANS; VAR C D; RUN;
******************************
* EXPONENTIAL TRANSFORMATION *
******************************;
PROC NLIN DATA=STEP1 NOITPRINT MAXITER=5000 METHOD=MARQUARDT;
PARMS A=0 B=1 C=0.01;
MODEL NY = A + B*EXP(-C*(Y+YMIN) );
OUTPUT OUT=TEMP PARMS=A B C;
RUN;
PROC MEANS; VAR C; RUN;
exponent of 0
exponent of < 0, > 0
exponentiation (Manley transformation)
Transformation possibilities example
SAS code for normality & variance equality tests
FILENAME INDATA 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
OPTIONS LINESIZE=72;
TITLE 'SW-BASED TRANSFORMATIONS FOR THE PR AREA DATA';
************************************************
* READ IN GEOREFERENCED DATA;
*
* THEN CENTER THE SELEDTED ATTRIBUTE VARIABLE *
************************************************;
DATA STEP1;
INFILE INDATA LRECL=1024;
INPUT ID AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
Y = MCT_RATIO;
TRY = LOG(Y – 0.20);
X0=1;
RUN;
PROC UNIVARIATE NORMAL; VAR Y TRY; RUN;
PROC MEANS DATA=STEP1 NOPRINT; VAR Y; OUTPUT OUT=YMEAN MEAN=YMEAN; RUN;
DATA STEP1(REPLACE=YES); SET STEP1; IF _N_=1 THEN SET YMEAN(KEEP=YMEAN);
IF Y>YMEAN THEN IYMEAN=1; ELSE IYMEAN=0;
RUN;
PROC GLM; CLASS IYMEAN; MODEL Y=IYMEAN; MEANS IYMEAN/HOVTEST=BARTLETT; RUN;
PROC GLM; CLASS IYMEAN; MODEL Y=IYMEAN; MEANS IYMEAN/HOVTEST=LEVENE; RUN;
PROC MEANS DATA=STEP1 NOPRINT; VAR TRY; OUTPUT OUT=TRYMEAN MEAN=TRYMEAN; RUN;
DATA STEP1(REPLACE=YES); SET STEP1; IF _N_=1 THEN SET TRYMEAN(KEEP=TRYMEAN);
IF TRY>TRYMEAN THEN ITRYMEAN=1; ELSE ITRYMEAN=0;
RUN;
PROC GLM; CLASS ITRYMEAN; MODEL TRY=ITRYMEAN; MEANS ITRYMEAN/HOVTEST=BARTLETT; RUN;
PROC GLM; CLASS ITRYMEAN; MODEL TRY=ITRYMEAN; MEANS ITRYMEAN/HOVTEST=LEVENE; RUN;
SAS code for Gi, LISA and zLISA
FILENAME INDATA1 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-AREAS-COMPETITION.TXT';
FILENAME INDATA2 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-DEM&QUAD-DATA.TXT';
FILENAME CONN 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-CON.TXT';
FILENAME OUTFILE 'D:\JYU-SUMMERSCHOOL2006\LAB#2\PR-LISA-OUT.TXT';
OPTIONS LINESIZE=72;
TITLE 'GI AND LISA FOR THE PR AREA DATA';
**************************************************************************************
* READ IN THE N-BY-N GEOGRAPHIC CONNECTIVITY MATRIX; MAKE SURE NOT TO MISREAD THE ID *
**************************************************************************************;
DATA STEP1; INFILE CONN; INPUT IDC C1-C73; RUN;
****************************************************************************
* READ IN GEOREFERENCED DATA; THEN CENTER THE SELECTED ATTRIBUTE VARIABLE *
****************************************************************************;
DATA STEP2A; INFILE INDATA1 LRECL=1024;
INPUT POLYGON AREAM AREACT MCT_RATIO TRMCT_RATIO AREAPT MPT_RATIO TRMPT_RATIO NAME$;
* Y = AREAM;
* Y = MCT_RATIO;
X0=1;
RUN;
PROC SORT DATA=STEP2A OUT=STEP2A(REPLACE=YES); BY NAME; RUN;
DATA STEP2B; INFILE INDATA2 LRECL=1024; INPUT IDDEM MELEV SELEV U V QUAD NAME$;
U=U/1000; V=V/1000;
Y=LOG(MELEV+17.5);
* Y=(SELEV-25)**0.5;
YO=Y;
RUN;
PROC SORT DATA=STEP2B OUT=STEP2B(REPLACE=YES); BY NAME; RUN;
DATA STEP2; MERGE STEP2A STEP2B; BY NAME; RUN;
PROC
PROC
PROC
DATA
STANDARD MEAN=0 STD=1 OUT=STEP2(REPLACE=YES); VAR Y; RUN;
MEANS DATA=STEP2 NOPRINT; VAR X0; OUTPUT OUT=NOUT SUM=N; RUN;
MEANS DATA=STEP2 NOPRINT; VAR YO; OUTPUT OUT=YOOUT MEAN=YOBAR USS=YOUSS; RUN;
STEP2(REPLACE=YES); SET STEP2; SET STEP1;
ARRAY CONN{73} C1-C73;
ARRAY YCONN{73} CY1-CY73;
ARRAY YOCONN{73} CYO1-CYO73;
CSUM = 0;
DO I=1 TO 73;
CSUM = CSUM + CONN{I};
IF _N_=I THEN CONN{I}=0;
YCONN{I} = Y*CONN{I};
YOCONN{I} = YO*CONN{I};
END;
RUN;
PROC
PROC
PROC
PROC
DATA
MEANS DATA=STEP2 NOPRINT; VAR CY1-CY73; OUTPUT OUT=CYOUT1 SUM=CY1-CY73; RUN;
TRANSPOSE DATA=CYOUT1 PREFIX=CY OUT=CYOUT2; VAR CY1-CY73; RUN;
MEANS DATA=STEP2 NOPRINT; VAR CYO1-CYO73; OUTPUT OUT=CYOOUT1 SUM=CYO1-CYO73; RUN;
TRANSPOSE DATA=CYOOUT1 PREFIX=CYO OUT=CYOOUT2; VAR CYO1-CYO73; RUN;
STEP3;
SET STEP2(KEEP=POLYGON Y YO CSUM);
SET CYOUT2(KEEP=CY1);
SET CYOOUT2(KEEP=CYO1);
IF _N_=1 THEN SET NOUT(KEEP=N);
would be (n-2)
IF _N_=1 THEN SET YOOUT(KEEP=YOBAR YOUSS);
for an unbiased
LISA=Y*CY1/((N-1)/N);
estimate
ELISA=-CSUM/(N-1);
GI=(CYO1-((N*YOBAR-YO)/(N-1))*CSUM)/
( SQRT((YOUSS-YO**2-((N*YOBAR-YO)**2)/(N-1))/(N-1))*SQRT(((N-1)*CSUM-CSUM**2)/(N-2)) );
DROP CY1;
RUN;
PROC PRINT; VAR LISA GI; RUN;
*****************************
*
*
* CONDITIONAL RANDOMIZATION *
*
*
*****************************;
PROC TRANSPOSE DATA=STEP3 PREFIX=N OUT=SAMPLE1; VAR CSUM; RUN;
PROC TRANSPOSE DATA=STEP3 PREFIX=Y OUT=SAMPLE2; VAR Y; RUN;
DATA SAMPLE3; SET SAMPLE1; SET SAMPLE2;
ARRAY TY{73} Y1-Y73;
ARRAY NI{73} N1-N73;
DO I=1 TO 73;
reduce to 100 for class purposes
DO J=1 TO 10000;
DO K=1 TO 73;
YI=TY{I}; YJ=TY{K};
PROB = RANUNI(0);
IF K=I THEN PROB=0;
NUMI=NI{I};
OUTPUT;
END;
END;
END;
DROP K Y1-Y73 N1-N73;
RUN;
PROC RANK DATA=SAMPLE3 OUT=SAMPLE3 (REPLACE=YES) DESCENDING; VAR PROB; RANKS RPROB; BY
I J; RUN;
DATA SAMPLE3(REPLACE=YES); SET SAMPLE3; IF _N_=1 THEN SET NOUT(KEEP=N); IF RPROB >
NUMI THEN DELETE;
YIYJ=YI*YJ/((N-1)/N);
RUN;
PROC MEANS NOPRINT; VAR YIYJ; BY I J; OUTPUT OUT=SAMPLE4 SUM=CYIYJ; RUN;
PROC MEANS NOPRINT; VAR CYIYJ; BY I; OUTPUT OUT=SAMPLE5 MEAN=MCYIYJ STD=SCYIYJ; RUN;
DATA FINAL; SET STEP3(KEEP=POLYGON LISA ELISA N); SET SAMPLE5(KEEP=MCYIYJ SCYIYJ);
ZLISA = (LISA-MCYIYJ)/SCYIYJ;
IF ZLISA> 0 THEN PROBLISA = 1 - PROBNORM(ZLISA);
ELSE PROBLISA =
PROBNORM(ZLISA);
BON1
= 0.05/N; BON2 = 1 - 0.01/N;
SIDAK1 = 1 - (1 - 0.05)**(1/N); SIDAK2 = (1 - 0.05)**(1/N);
IF PROBLISA < BON1 OR PROBLISA < SIDAK1 OR PROBLISA > BON2 OR PROBLISA > SIDAK2 THEN
IEXTREME=1; ELSE IEXTREME=0;
RUN;
PROC PRINT; VAR POLYGON LISA ZLISA PROBLISA IEXTREME BON1 BON2 SIDAK1 SIDAK2; RUN;
DATA EXTREMES; SET FINAL; IF IEXTREME=0 THEN DELETE; RUN;
PROC PRINT; VAR POLYGON LISA ZLISA PROBLISA IEXTREME BON1 BON2 SIDAK1 SIDAK2; RUN;
PROC CLUSTER DATA=FINAL NOPRINT SIMPLE METHOD=COMPLETE NOEIGEN OUTTREE=LISATREE;
VAR ZLISA;
ID POLYGON;
RUN;
****************************
* SET THE NUMBER OF GROUPS *
****************************;
PROC TREE DATA=LISATREE OUT=TREEOUT NCLUSTERS=5; ID POLYGON; RUN;
PROC SORT OUT=TREEOUT; BY POLYGON; RUN;
DATA
PROC
PROC
PROC
PROC
PROC
PROC
PROC
FINAL(REPLACE=YES); MERGE FINAL TREEOUT; BY POLYGON; RUN;
SORT OUT=FINAL(REPLACE=YES); BY CLUSTER ZLISA; RUN;
ANOVA; CLASS CLUSTER; MODEL ZLISA=CLUSTER; RUN;
MEANS DATA=FINAL NOPRINT; VAR ZLISA; BY CLUSTER; OUTPUT OUT=TEMP MIN=ZMIN MAX=ZMAX; RUN;
PRINT; RUN;
UNIVARIATE DATA=FINAL NOPRINT; VAR ZLISA; BY CLUSTER; OUTPUT OUT=TEMP2 PROBN=SW; RUN;
PRINT; RUN;
DISCRIM DATA=FINAL POOL=TEST; CLASS CLUSTER; VAR ZLISA; RUN;
DATA _NULL_; SET FINAL; FILE OUTFILE;
PUT CLUSTER POLYGON ZLISA;
RUN;
SAS code for Moran scatterplot regression diagnostics
FILENAME INDATA 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-DEM&QUAD-DATA.TXT';
FILENAME CONN 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-CON.TXT';
FILENAME OUTFILE 'D:\JYU-SUMMERSCHOOL2006\LAB#2\PR-REGRESSION-DIAGNOSTICS.TXT';
TITLE 'MORAN SCATTERPLOT REGRESSION DIAGNOSTICS FOR THE PR DATA';
**************************************************************
* READ IN THE N-BY-N GEOGRAPHIC CONNECTIVITY MATRIX;
*
**************************************************************;
DATA STEP1;
INFILE CONN LRECL=512;
INPUT IDC C1-C73;
ARRAY CONN{73} C1-C73;
CSUM = 0;
DO I=1 TO 73;
CSUM = CSUM + CONN{I};
END;
RUN;
PROC MEANS DATA=STEP1 NOPRINT; VAR CSUM; OUTPUT OUT=CSUM SUM=SUMCSUM; RUN;
PROC PRINT; RUN;
****************************************************************************
* READ IN GEOREFERENCED DATA; THEN CENTER THE SELECTED ATTRIBUTE VARIABLE *
****************************************************************************;
DATA STEP2;
INFILE INDATA;
IDDEM is
INPUT IDDEM MELEV SELEV U V QUAD NAME$;
U=U/1000; V=V/1000;
Y=LOG(MELEV+17.5);
* Y=(SELEV-25)**0.5;
X0=1;
RUN;
PROC SORT OUT=STEP2(REPLACE=YES); BY IDDEM; RUN;
PROC STANDARD MEAN=0 STD=1 OUT=STEP2(REPLACE=YES); VAR Y; RUN;
PROC MEANS NOPRINT; VAR X0; OUTPUT OUT=OUTN SUM=N; RUN;
the unique
id
DATA STEP2(REPLACE=YES); SET STEP2; SET STEP1;
ARRAY CONN{73} C1-C73;
ARRAY YCONN{73} CY1-CY73;
DO I=1 TO 73;
YCONN{I} = Y*CONN{I};
END;
RUN;
PROC MEANS NOPRINT; VAR CY1-CY73; OUTPUT OUT=CYOUT1 SUM=CY1-CY73; RUN;
PROC TRANSPOSE DATA=CYOUT1 PREFIX=CY OUT=CYOUT2; VAR CY1-CY73; RUN;
DATA STEP3; SET STEP2(KEEP=X0 Y CSUM); SET CYOUT2(KEEP=CY1);
CZY=CY1;
ZY=Y;
RUN;
PROC REG OUTEST=OUTREG; MODEL CZY=ZY/NOINT PRESS; OUTPUT OUT=MCPRED P=CZYHAT H=HI
RSTUDENT=RSTUDENTI DFFITS=DFFITSI; RUN;
DATA OUTREG(REPLACE=YES); SET OUTREG; SET OUTN(KEEP=N);
RMSEPRESS=SQRT(_PRESS_/N);
RUN;
PROC PRINT DATA=OUTREG; VAR _RMSE_ RMSEPRESS; RUN;
DATA _NULL_;
SET MCPRED; SET STEP2(KEEP=IDDEM NAME);
FILE OUTFILE;
PUT IDDEM HI RSTUDENTI DFFITSI NAME;
RUN;
DATA MCPRED(REPLACE=YES); SET MCPRED; IF _N_=1 THEN SET OUTN(KEEP=N);
IF HI<2*1/N THEN HI='.';
IF ABS(RSTUDENTI) < 2 THEN RSTUDENTI='.';
IF DFFITSI < 2/SQRT(1/N) THEN DFFITSI='.';
RUN;
DATA CHECK; SET MCPRED; SET STEP2(KEEP=IDDEM NAME);
IF HI='.' AND RSTUDENTI='.' AND DFFITSI='.' THEN DELETE;
RUN;
PROC PRINT DATA=CHECK; VAR IDDEM HI RSTUDENTI DFFITSI NAME; RUN;
SAS code for local statistics eigenvector covariates
FILENAME EVECS 'D:\JYU-SUMMERSCHOOL2006\LAB#1\PR-EIGENVECTORS.TXT';
FILENAME LISA 'D:\JYU-SUMMERSCHOOL2006\LAB#2\PR-LISA-OUT.TXT';
FILENAME REGD 'D:\JYU-SUMMERSCHOOL2006\LAB#2\PR-REGRESSION-DIAGNOSTICS.TXT';
FILENAME OUTFILE 'D:\JYU-SUMMERSCHOOL2006\LAB#2\PR-SELECTED-EVECS.TXT';
TITLE 'SPATIAL AUTOCORRELATION IN PR MORAN SCATTERPLOT DIAGNOSTICS';
******************************************************
* READ IN EIGENVECTORS, LISA, REGRESSION DIAGNOSTICS *
******************************************************;
DATA STEP1; INFILE EVECS LRECL=1024; INPUT IDE E1-E73; RUN;
PROC SORT OUT=STEP1(REPLACE=YES); BY IDE; RUN;
DATA STEP2; INFILE LISA; INPUT CLUSTER IDL ZLISA; RUN;
PROC SORT OUT=STEP2(REPLACE=YES); BY IDL; RUN;
DATA STEP3; INFILE REGD; INPUT IDR HI RSTUDENTI DFFITSI; RUN;
PROC SORT OUT=STEP3(REPLACE=YES); BY IDR; RUN;
DATA FINAL; SET STEP1; SET STEP2; SET STEP3; RUN;
PROC PRINT; VAR IDE IDL IDR; RUN;
PROC REG; MODEL ZLISA
= E1-E73/SELECTION=STEPWISE SLE=0.01; OUTPUT
P=ZLISAHAT; RUN;
PROC GPLOT; PLOT ZLISA*ZLISAHAT ZLISA*ZLISA/OVERLAY; RUN;
PROC REG; MODEL HI
= E1-E73/SELECTION=STEPWISE SLE=0.01; OUTPUT
P=HIHAT; RUN;
PROC GPLOT; PLOT HI*HIHAT HI*HI/OVERLAY; RUN;
PROC REG; MODEL RSTUDENTI = E1-E73/SELECTION=STEPWISE SLE=0.01; OUTPUT
P=RSTHAT; RUN;
PROC GPLOT; PLOT RSTUDENTI*RSTHAT RSTUDENTI*RSTUDENTI/OVERLAY; RUN;
PROC REG; MODEL DFFITSI
= E1-E73/SELECTION=STEPWISE SLE=0.01; OUTPUT
P=DFFITSHAT; RUN;
PROC GPLOT; PLOT DFFITSI*DFFITSHAT DFFITSI*DFFITSI/OVERLAY; RUN;
DATA _NULL_; SET FINAL(KEEP=IDE E2); FILE OUTFILE; PUT IDE E2; RUN;
OUT=OUTLISA
OUT=OUTHI
OUT=OUTRST
OUT=OUTDFFITS
ArcGIS 9.1: LISA & Getis-Ord Gi
This is to be fully functional in
ArcGIS 9.2!
Constructing spider diagrams in
ArcView
• Enable Geoprocessing extension
• Use Geoprocessing Wizard to disolve
areal units on the basis of some grouping
• Enable “Add true X,Y Centroid” extension
• Compute centroids of disolved *.shp map
• RUN spider diagram script to add “event
themes” from original map (centers) and
disolved map (points)
What you should have learned in
today’s lab:
1. Identify power transformation for variables
2. Calculate normality and variance equality
(attribute and geographic) diagnostics
3. Reconstruct the Moran scatterplots and the
semivariogram plots
4. Calculate and map Gi, LISA and regression
diagnostic statistics
5. The ability to construct spider diagrams
Download