Page tree

Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

ULTIMATE CLUSTER APPROXIMATION with stratification at the first stage

Variance and standard error for proportion of domain d in the population (general case, where weights may be different within a stratum as consequence of non-response adjustment and calibration)


(SAS manual, surveyfreq, page 6323-6324)

SAS syntaxes

Variance estimation at country level

The following is an example of SAS syntaxes for estimation of standard errors for Number of farms with utilised agricultural area (UAA) in Portugal in a table with dimensions FARMTYPE, SO_EUR (standard output size classes) and AGRAREA (utilised agricultural area size classes).

Because UAA is a CORE variable, PROC SURVEYFREQ uses the Extrapolation factor of the CORE as weight.

*******************************************************************

/*Determine the extrapolation factor and the coverage of the table

*******************************************************************;


data DATA_CORE;

set IFSORA.IFS_T_MAIN_2020 ;


* Our table covers Portugal and the data in the CORE. It covers only the holdings in the main frame (HLD_FEF=”0”).

* We also can select 2 or more countries;

where (country="PT" and EXTPOL_FACT1_CORE is not missing and HLD_FEF="0");


* We make the following replacement, in order to allow valid computations also for censuses. The replacement is not made in the original dataset but in the intermediary table DATA_CORE.

if STRA_IDF_CORE ='_Z' then STRA_ID_CORE=1 ;


* We replace null extrapolation factors with 1 but only to compute the “weight_core”. The fields EXTPOL_FACT*_CORE remain unchanged;

* The first extrapolation factor for CORE is always completed and never null (unlike for the MODULES);

if missing(EXTPOL_FACT2_CORE) then EXTPOL_FACT2_CORE_n=1; else EXTPOL_FACT2_CORE_n=EXTPOL_FACT2_CORE;

if missing(EXTPOL_FACT3_CORE) then EXTPOL_FACT3_CORE_n=1; else EXTPOL_FACT3_CORE_n=EXTPOL_FACT3_CORE;

weight_core=EXTPOL_FACT1_CORE*EXTPOL_FACT2_CORE_n*EXTPOL_FACT3_CORE_n;


if OSU_S1_CORE=. then OSU_S1_CORE=1;

if PSU_CORE=. then PSU_CORE=HLD_ID;


keep country year HLD_ID STRA_ID_CORE PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ;


run;


*****************************************;

*PROC SURVEYFREQ;

******************************************;


** Construction of computational strata, depending on OSU_S1_ and PSU_

*/;


PROC SQL;

   CREATE TABLE _CST1 AS SELECT COUNTRY, STRA_ID_CORE,PSU_CORE,MIN(OSU_S1_CORE) AS OSU_S1_CORE

   FROM DATA_CORE GROUP BY COUNTRY, STRA_ID_CORE,PSU_CORE;

QUIT;


PROC SORT DATA=_CST1;

   BY COUNTRY STRA_ID_CORE OSU_S1_CORE;

RUN;


*** Within each formal stratum STRA_ID_:

The first record of each formal stratum receives _SEQ=1 then the following records of the same stratum receive _SEQ = an incrementing number by 1.

If OSU_S1 (the rank of systematic sampling) is incrementing (so a systematic sampling is used) and the record has _SEQ>2 (so the record is not the first and not the second, these first 2 records form the first computational stratum),  then the formal stratum STRA_ID_ gets split i.e. a new computational stratum _CST is created within the formal stratum STRA_ID_.

The process is iterative (DO - END).

The number of records for each computational stratum is 2 (defined by _SEQ>2).

The split is not done before a record which is the last one in the formal stratum to avoid that the last computational stratum has less than 2 records.

;

DATA _CST;

   SET _CST1;

   RETAIN _CST 1 _SEQ;

   BY COUNTRY STRA_ID_CORE;

   P_OSU_S1_CORE=LAG(OSU_S1_CORE);

   IF FIRST.STRA_ID_CORE THEN DO;

      _SEQ=1;

      P_OSU_S1_CORE=.;

      _CST=1;

   END;

   ELSE _SEQ+1;

   IF OSU_S1_CORE>P_OSU_S1_CORE AND _SEQ>2 AND NOT(LAST.STRA_ID_CORE) THEN DO;

      _CST+1;

      _SEQ=1;

   END;    

*  DROP _SEQ P_OSU_S1_CORE ;

RUN;


*** Generalisation;

/**** In case performance problems occur with the above syntax, please try using the below syntax instead of the above syntax where you can define _NCST (new computational strata) = 3 or 4 etc.  instead of 2. Please let us (the farm team in E1) know if you had to use this syntax and which value you took for _NCST

/*


%LET _NCST=3;


DATA _CST;

   SET _CST1;

   RETAIN _CST 1 _SEQ;

   BY COUNTRY STRA_ID_CORE;

   P_OSU_S1_CORE=LAG(OSU_S1_CORE);

   IF FIRST.STRA_ID_CORE THEN DO;

      _SEQ=1;

      P_OSU_S1_CORE=.;

      _CST=1;

   END;

   ELSE _SEQ+1;

   IF OSU_S1_CORE>P_OSU_S1_CORE AND _SEQ>&_NCST. AND NOT(LAST.STRA_ID_CORE) THEN DO;

      _CST+1;

      _SEQ=1;

   END;    

   DROP _SEQ P_OSU_S1_CORE ;

RUN;

*/


PROC SQL;

   CREATE TABLE DATA_CORE AS SELECT A.*,B._CST

   FROM DATA_CORE A

   LEFT JOIN _CST B ON A.COUNTRY=B.COUNTRY AND A.STRA_ID_CORE=B.STRA_ID_CORE AND A.PSU_CORE=B.PSU_CORE;

QUIT;


proc sort data=data_core;

by COUNTRY HLD_ID STRA_ID_CORE _CST PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ;

run;


proc transpose data=data_core out=data_core_bis (rename=(col1=VALUE _name_=CROPS));

by COUNTRY HLD_ID STRA_ID_CORE _CST PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ;

var UAA ;

run;


* transform VALUE into binary variable ;


data data_core_bis;

set data_core_bis;

if value>0 then value=1; else value=0;

run;


data data_core_bis;

set data_core_bis ;

/*Case1: SE calculation for all breakdowns of the dimensions*/ 

dmn_CROPS=FARMTYPE||SO_EUR||AGRAREA||VALUE ; 

* We can write dmn_CROPS=REGIONS||FARMTYPE||SO_EUR||AGRAREA||VALUE if we want Standard error estimates by REGION also; 

 

/*Case2: SE calculation for FARMTYPE=TOTAL breakdown and all breakdowns of SO_EUR and AGRAREA*/ 

dmn_FARMTYPE=SO_EUR||AGRAREA||VALUE; 

 

/*Case3: SE calculation for SO_EUR=TOTAL breakdown and all breakdowns of FARMTYPE and AGRAREA*/ 

dmn_SO_EUR=FARMTYPE||AGRAREA||VALUE ; 

 

/*Case4: SE calculation for AGRAREA=TOTAL breakdown and all breakdowns of FARMTYPE and SO_EUR*/ 

dmn_AGRAREA=FARMTYPE||SO_EUR||VALUE ; 


run;


proc sql;

create table pop_STRA_ID as

selectCOUNTRY, CROPS,  STRA_ID_CORE,_CST, sum(weight_core) as _total_, count(*) as sample

from data_core_bis

group by COUNTRY, CROPS,  STRA_ID_CORE,_CST;

quit;


data pop_STRA_ID;

set pop_STRA_ID;

if _total_ < sample then _total_=sample;

run;


proc sort data=data_core_bis;

by COUNTRY CROPS STRA_ID_CORE _CST;

run;


/*Case1: SE computed for all breakdowns of dimensions*/ 


proc surveyfreq data=data_core_bis total=pop_STRA_ID ;

   tables dmn_CROPS ;

   by COUNTRY CROPS;

   strata STRA_ID_CORE _CST;

   cluster PSU_CORE;

   weight weight_core;

   ods output OneWay=sfreq_CROPS;

run;


* Keep only the SE where VALUE is 1 (meaning for the Number of holdings with the utilised agricultural area>0) ;


data sfreq_CROPS1 (where=(VALUE="1"));;

set sfreq_CROPS;

FARMTYPE=scan(dmn_CROPS, 1, " ");

SO_EUR=scan(dmn_CROPS, 2, " ");

AGRAREA=scan(dmn_CROPS, 3, " ");

VALUE=scan(dmn_CROPS, 4, " ");

run;


The result is the standard errors stored in field SE in table sfreq_CROPS1.

/*Case2: SE computed for FARMTYPE=TOTAL breakdown and all breakdowns of SO_EUR and AGRAREA*/ 

proc surveyfreq data=data_core_bis total=pop_STRA_ID ; 

   tables dmn_FARMTYPE ; 

   by COUNTRY CROPS; 

   strata STRA_ID_CORE _CST;

   cluster PSU_CORE;

   weight weight_core;

   ods output OneWay=sfreq_CROPS; 

run;


data sfreq_CROPS2 (where=(VALUE="1"));; 

set sfreq_CROPS; 

FARMTYPE='TOTAL'; 

SO_EUR=scan(dmn_FARMTYPE, 1" "); 

AGRAREA=scan(dmn_FARMTYPE, 2" "); 

VALUE=scan(dmn_FARMTYPE, 3" "); 

run


The result is the standard errors stored in field SE in table sfreq_CROPS2.


/*Case3: SE computed for SO_EUR=TOTAL breakdown and all breakdowns of FARMTYPE and AGRAREA*/ 

proc surveyfreq data=data_2016_bis total=pop_STRA_ID ; 

   tables dmn_SO_EUR ; 

   by COUNTRY CROPS; 

   strata STRA_ID_CORE _CST;

   cluster PSU_CORE;

   weight weight_core;

   ods output OneWay=sfreq_CROPS; 

run;

 

data sfreq_CROPS3 (where=(VALUE="1"));; 

set sfreq_CROPS; 

SO_EUR='TOTAL'; 

FARMTYPE=scan(dmn_SO_EUR , 1" "); 

AGRAREA=scan(dmn_SO_EUR , 2" "); 

VALUE=scan(dmn_SO_EUR , 3" "); 

run


The result is the standard errors stored in field SE in table sfreq_CROPS3.

/*Case4: SE computed for AGRAREA=TOTAL breakdown and all breakdowns of FARMTYPE and SO_EUR*/ 

proc surveyfreq data=data_2016_bis total=pop_STRA_ID ; 

   tables dmn_AGRAREA ; 

   by COUNTRY CROPS; 

   strata STRA_ID_CORE _CST;

   cluster PSU_CORE;

   weight weight_core;

   ods output OneWay=sfreq_CROPS; 

run;

 

data sfreq_CROPS4 (where=(VALUE="1"));; 

set sfreq_CROPS; 

AGRAREA='TOTAL'; 

FARMTYPE=scan(dmn_AGRAREA , 1" "); 

SO_EUR=scan(dmn_AGRAREA , 2" "); 

VALUE=scan(dmn_AGRAREA , 3" "); 

run


The result is the standard errors stored in field SE in table sfreq_CROPS4.

...