...
ULTIMATE CLUSTER APPROXIMATION with stratification at the first stage | |||||||||
Variance and standard error for proportion of domain d in the population (general case, where weights may be different within a stratum as consequence of non-response adjustment and calibration) | |||||||||
SAS syntaxes | |||||||||
Variance estimation at country level The following is an example of SAS syntaxes for estimation of standard errors for Number of farms with utilised agricultural area (UAA) in Portugal in a table with dimensions FARMTYPE, SO_EUR (standard output size classes) and AGRAREA (utilised agricultural area size classes). Because UAA is a CORE variable, PROC SURVEYFREQ uses the Extrapolation factor of the CORE as weight. ******************************************************************* /*Determine the extrapolation factor and the coverage of the table *******************************************************************; data DATA_CORE; set IFSORA.IFS_T_MAIN_2020 ; * Our table covers Portugal and the data in the CORE. It covers only the holdings in the main frame (HLD_FEF=”0”). * We also can select 2 or more countries; where (country="PT" and EXTPOL_FACT1_CORE is not missing and HLD_FEF="0"); * We make the following replacement, in order to allow valid computations also for censuses. The replacement is not made in the original dataset but in the intermediary table DATA_CORE. if STRA_IDF_CORE ='_Z' then STRA_ID_CORE=1 ; * We replace null extrapolation factors with 1 but only to compute the “weight_core”. The fields EXTPOL_FACT*_CORE remain unchanged; * The first extrapolation factor for CORE is always completed and never null (unlike for the MODULES); if missing(EXTPOL_FACT2_CORE) then EXTPOL_FACT2_CORE_n=1; else EXTPOL_FACT2_CORE_n=EXTPOL_FACT2_CORE; if missing(EXTPOL_FACT3_CORE) then EXTPOL_FACT3_CORE_n=1; else EXTPOL_FACT3_CORE_n=EXTPOL_FACT3_CORE; weight_core=EXTPOL_FACT1_CORE*EXTPOL_FACT2_CORE_n*EXTPOL_FACT3_CORE_n; if OSU_S1_CORE=. then OSU_S1_CORE=1; if PSU_CORE=. then PSU_CORE=HLD_ID; keep country year HLD_ID STRA_ID_CORE PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ; run; *****************************************; *PROC SURVEYFREQ; ******************************************; ** Construction of computational strata, depending on OSU_S1_ and PSU_ */; PROC SQL; CREATE TABLE _CST1 AS SELECT COUNTRY, STRA_ID_CORE,PSU_CORE,MIN(OSU_S1_CORE) AS OSU_S1_CORE FROM DATA_CORE GROUP BY COUNTRY, STRA_ID_CORE,PSU_CORE; QUIT; PROC SORT DATA=_CST1; BY COUNTRY STRA_ID_CORE OSU_S1_CORE; RUN; *** Within each formal stratum STRA_ID_: The first record of each formal stratum receives _SEQ=1 then the following records of the same stratum receive _SEQ = an incrementing number by 1. If OSU_S1 (the rank of systematic sampling) is incrementing (so a systematic sampling is used) and the record has _SEQ>2 (so the record is not the first and not the second, these first 2 records form the first computational stratum), then the formal stratum STRA_ID_ gets split i.e. a new computational stratum _CST is created within the formal stratum STRA_ID_. The process is iterative (DO - END). The number of records for each computational stratum is 2 (defined by _SEQ>2). The split is not done before a record which is the last one in the formal stratum to avoid that the last computational stratum has less than 2 records. ; DATA _CST; SET _CST1; RETAIN _CST 1 _SEQ; BY COUNTRY STRA_ID_CORE; P_OSU_S1_CORE=LAG(OSU_S1_CORE); IF FIRST.STRA_ID_CORE THEN DO; _SEQ=1; P_OSU_S1_CORE=.; _CST=1; END; ELSE _SEQ+1; IF OSU_S1_CORE>P_OSU_S1_CORE AND _SEQ>2 AND NOT(LAST.STRA_ID_CORE) THEN DO; _CST+1; _SEQ=1; END; * DROP _SEQ P_OSU_S1_CORE ; RUN; *** Generalisation; /**** In case performance problems occur with the above syntax, please try using the below syntax instead of the above syntax where you can define _NCST (new computational strata) = 3 or 4 etc. instead of 2. Please let us (the farm team in E1) know if you had to use this syntax and which value you took for _NCST /* %LET _NCST=3; DATA _CST; SET _CST1; RETAIN _CST 1 _SEQ; BY COUNTRY STRA_ID_CORE; P_OSU_S1_CORE=LAG(OSU_S1_CORE); IF FIRST.STRA_ID_CORE THEN DO; _SEQ=1; P_OSU_S1_CORE=.; _CST=1; END; ELSE _SEQ+1; IF OSU_S1_CORE>P_OSU_S1_CORE AND _SEQ>&_NCST. AND NOT(LAST.STRA_ID_CORE) THEN DO; _CST+1; _SEQ=1; END; DROP _SEQ P_OSU_S1_CORE ; RUN; */ PROC SQL; CREATE TABLE DATA_CORE AS SELECT A.*,B._CST FROM DATA_CORE A LEFT JOIN _CST B ON A.COUNTRY=B.COUNTRY AND A.STRA_ID_CORE=B.STRA_ID_CORE AND A.PSU_CORE=B.PSU_CORE; QUIT; proc sort data=data_core; by COUNTRY HLD_ID STRA_ID_CORE _CST PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ; run; proc transpose data=data_core out=data_core_bis (rename=(col1=VALUE _name_=CROPS)); by COUNTRY HLD_ID STRA_ID_CORE _CST PSU_CORE OSU_S1_CORE weight_core farmtype SO_EUR AGRAREA UAA ; var UAA ; run; * transform VALUE into binary variable ; data data_core_bis; set data_core_bis; if value>0 then value=1; else value=0; run; data data_core_bis; set data_core_bis ; /*Case1: SE calculation for all breakdowns of the dimensions*/ dmn_CROPS=FARMTYPE||SO_EUR||AGRAREA||VALUE ; * We can write dmn_CROPS=REGIONS||FARMTYPE||SO_EUR||AGRAREA||VALUE if we want Standard error estimates by REGION also;
/*Case2: SE calculation for FARMTYPE=TOTAL breakdown and all breakdowns of SO_EUR and AGRAREA*/ dmn_FARMTYPE=SO_EUR||AGRAREA||VALUE;
/*Case3: SE calculation for SO_EUR=TOTAL breakdown and all breakdowns of FARMTYPE and AGRAREA*/ dmn_SO_EUR=FARMTYPE||AGRAREA||VALUE ;
/*Case4: SE calculation for AGRAREA=TOTAL breakdown and all breakdowns of FARMTYPE and SO_EUR*/ dmn_AGRAREA=FARMTYPE||SO_EUR||VALUE ; run; proc sql; create table pop_STRA_ID as selectCOUNTRY, CROPS, STRA_ID_CORE,_CST, sum(weight_core) as _total_, count(*) as sample from data_core_bis group by COUNTRY, CROPS, STRA_ID_CORE,_CST; quit; data pop_STRA_ID; set pop_STRA_ID; if _total_ < sample then _total_=sample; run; proc sort data=data_core_bis; by COUNTRY CROPS STRA_ID_CORE _CST; run; /*Case1: SE computed for all breakdowns of dimensions*/ proc surveyfreq data=data_core_bis total=pop_STRA_ID ; tables dmn_CROPS ; by COUNTRY CROPS; strata STRA_ID_CORE _CST; cluster PSU_CORE; weight weight_core; ods output OneWay=sfreq_CROPS; run; * Keep only the SE where VALUE is 1 (meaning for the Number of holdings with the utilised agricultural area>0) ; data sfreq_CROPS1 (where=(VALUE="1"));; set sfreq_CROPS; FARMTYPE=scan(dmn_CROPS, 1, " "); SO_EUR=scan(dmn_CROPS, 2, " "); AGRAREA=scan(dmn_CROPS, 3, " "); VALUE=scan(dmn_CROPS, 4, " "); run; The result is the standard errors stored in field SE in table sfreq_CROPS1. /*Case2: SE computed for FARMTYPE=TOTAL breakdown and all breakdowns of SO_EUR and AGRAREA*/ proc surveyfreq data=data_core_bis total=pop_STRA_ID ; tables dmn_FARMTYPE ; by COUNTRY CROPS; strata STRA_ID_CORE _CST; cluster PSU_CORE; weight weight_core; ods output OneWay=sfreq_CROPS; run; data sfreq_CROPS2 (where=(VALUE="1"));; set sfreq_CROPS; FARMTYPE='TOTAL'; SO_EUR=scan(dmn_FARMTYPE, 1, " "); AGRAREA=scan(dmn_FARMTYPE, 2, " "); VALUE=scan(dmn_FARMTYPE, 3, " "); run; The result is the standard errors stored in field SE in table sfreq_CROPS2. /*Case3: SE computed for SO_EUR=TOTAL breakdown and all breakdowns of FARMTYPE and AGRAREA*/ proc surveyfreq data=data_2016_bis total=pop_STRA_ID ; tables dmn_SO_EUR ; by COUNTRY CROPS; strata STRA_ID_CORE _CST; cluster PSU_CORE; weight weight_core; ods output OneWay=sfreq_CROPS; run;
data sfreq_CROPS3 (where=(VALUE="1"));; set sfreq_CROPS; SO_EUR='TOTAL'; FARMTYPE=scan(dmn_SO_EUR , 1, " "); AGRAREA=scan(dmn_SO_EUR , 2, " "); VALUE=scan(dmn_SO_EUR , 3, " "); run; The result is the standard errors stored in field SE in table sfreq_CROPS3. /*Case4: SE computed for AGRAREA=TOTAL breakdown and all breakdowns of FARMTYPE and SO_EUR*/ proc surveyfreq data=data_2016_bis total=pop_STRA_ID ; tables dmn_AGRAREA ; by COUNTRY CROPS; strata STRA_ID_CORE _CST; cluster PSU_CORE; weight weight_core; ods output OneWay=sfreq_CROPS; run;
data sfreq_CROPS4 (where=(VALUE="1"));; set sfreq_CROPS; AGRAREA='TOTAL'; FARMTYPE=scan(dmn_AGRAREA , 1, " "); SO_EUR=scan(dmn_AGRAREA , 2, " "); VALUE=scan(dmn_AGRAREA , 3, " "); run; The result is the standard errors stored in field SE in table sfreq_CROPS4. |
...