Page tree

Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

ULTIMATE CLUSTER APPROXIMATION with stratification at the first stage

HTML
<strong> Variance and coefficient of variation of the total (general case, where weights may differ within a stratum as a consequence of non-response adjustment and calibration) </strong>  <a href="#var" aria-describedby="footnote-label" id="var-ref"></a>.  

Where:


HTML
<center> <strong> SAS syntaxes </strong> <a href="#sas" aria-describedby="footnote-label" id="sas-ref"></a> </center>



Variance estimation at country level

The following is an example of SAS syntax that estimates the coefficients of variation for the total Utilised agricultural area (UAA) (HA) in Portugal, first in a table with dimensions FARMTYPE, SO_EUR (standard output size classes) and AGRAREA (utilised agricultural area size classes), and then in a table with dimensions FARMTYPE and SO_EUR. Because UAA is a CORE variable, PROC SURVEYMEANS uses the extrapolation factor of the CORE as weight.

*Determine the extrapolation factor and the coverage of the table ;

/* Builds the working table DATA_CORE from IFSORA.IFS_T_MAIN_2020:
   - keeps the records of the selected country that have a first CORE
     extrapolation factor and belong to the main frame,
   - derives the overall weight WEIGHT_CORE,
   - fills missing design identifiers so that later steps can run,
   - derives UAA as UAAS+UAAT.
   The source dataset itself is not modified. */
data DATA_CORE;

  set IFSORA.IFS_T_MAIN_2020 ;

  *Our table covers Portugal and the data in the CORE. It covers only the holdings in the main frame (HLD_FEF="0");
  *We also can select 2 or more countries;

  /* The country literal is written without padding blanks: in a SAS character
     comparison trailing blanks are ignored but a leading blank is significant,
     so a padded literal would never equal the code PT and DATA_CORE would be empty. */
  where (country="PT" and EXTPOL_FACT1_CORE is not missing and HLD_FEF="0");

  *We make the following replacement, in order to allow valid computations also for censuses. The replacement is not made in the original dataset but in the intermediary table DATA_CORE;

  /* NOTE(review): the tested variable previously read STRA_IDF_CORE, a name used
     nowhere else in this program. It is assumed to be a typo for STRA_ID_CORE
     (the variable being replaced) - please confirm against the dataset layout. */
  if STRA_ID_CORE ='_Z' then STRA_ID_CORE= 1 ;

  *We replace null extrapolation factors with 1 but only to compute the "weight_core". The fields EXTPOL_FACT*_CORE remain unchanged;

  *The first extrapolation factor for CORE is always completed and never null (unlike for the MODULES);

  if missing(EXTPOL_FACT2_CORE) then EXTPOL_FACT2_CORE_n= 1 ; else EXTPOL_FACT2_CORE_n=EXTPOL_FACT2_CORE;

  if missing(EXTPOL_FACT3_CORE) then EXTPOL_FACT3_CORE_n= 1 ; else EXTPOL_FACT3_CORE_n=EXTPOL_FACT3_CORE;

  /* Overall weight = product of the (up to three) CORE extrapolation factors */
  weight_core=EXTPOL_FACT1_CORE*EXTPOL_FACT2_CORE_n*EXTPOL_FACT3_CORE_n;

  /* Records without a systematic-sampling rank all receive rank 1 */
  if OSU_S1_CORE=. then OSU_S1_CORE= 1 ;

  /* Records without a PSU are treated as their own cluster (the holding itself) */
  if PSU_CORE=. then PSU_CORE=HLD_ID;

  /* Analysis variable; the result is missing when either component is missing */
  UAA=UAAS+UAAT;

run ;

**********************************************

PROC SURVEYMEANS;

**********************************************

Construction of computational strata, depending on OSU_S1_ and PSU_ ;

/* _CST1: one row per PSU within each COUNTRY / formal stratum, carrying the
   smallest OSU_S1_CORE found among the records of that PSU. */
proc sql;
  create table _CST1 as
  select COUNTRY,
         STRA_ID_CORE,
         PSU_CORE,
         min(OSU_S1_CORE) as OSU_S1_CORE
  from DATA_CORE
  group by COUNTRY,
           STRA_ID_CORE,
           PSU_CORE;
quit;

/* Order the PSUs by OSU_S1_CORE inside each formal stratum, as required by the
   BY-group processing of the next DATA step. */
proc sort data=_CST1;
  by COUNTRY STRA_ID_CORE OSU_S1_CORE;
run;

***Within each formal stratum STRA_ID_:

The first record of each formal stratum receives _SEQ=1 then the following records of the same stratum receive _SEQ = an incrementing number by 1.

If OSU_S1 (the rank of systematic sampling) is incrementing (so a systematic sampling is used) and the record has _SEQ>2 (so the record is not the first and not the second, these first 2 records form the first computational stratum), then the formal stratum STRA_ID_ gets split i.e. a new computational stratum _CST is created within the formal stratum STRA_ID_. The process is iterative (DO - END). The number of records for each computational stratum is 2 (defined by _SEQ>2).

The split is not done before a record if that record is the last one in the formal stratum. This is done in order to avoid that the last computational stratum has less than 2 records. ;

/* Assigns a computational stratum number _CST to every row of _CST1.
   _CST and _SEQ are retained across rows: _SEQ is the position of the row inside
   the current computational stratum, _CST the running stratum number inside the
   formal stratum STRA_ID_CORE. */
data _CST;
  set _CST1;
  retain _CST 1 _SEQ;
  by COUNTRY STRA_ID_CORE;

  /* LAG must run on every row (it is a queue), so it is never called conditionally */
  P_OSU_S1_CORE = lag(OSU_S1_CORE);

  if first.STRA_ID_CORE then do;
    /* New formal stratum: restart both counters and forget the previous rank */
    _SEQ = 1;
    P_OSU_S1_CORE = .;
    _CST = 1;
  end;
  else _SEQ + 1;

  /* Open a new computational stratum at this row when the current one already
     holds 2 rows, the rank has increased, and this row is not the last row of
     the formal stratum (so the final computational stratum never has one row). */
  if not(last.STRA_ID_CORE) and _SEQ > 2 and OSU_S1_CORE > P_OSU_S1_CORE then do;
    _CST + 1;
    _SEQ = 1;
  end;
run;

***Generalisation;

/*
In case performance problems occur with the above syntax, please try using the below syntax instead of the above syntax where you can define _NCST (new computational strata) = 3 or 4 etc. instead of 2.

Please let us (the Eurostat farm structure team) know if you had to use this syntax and which value you took for _NCST

The alternative step is kept disabled inside this block comment. The comment is
now terminated by the closing marker at its end. Previously it was never closed,
so every statement that followed it in the program was treated as comment text
and never executed. To use the alternative, copy the statements below out of the
comment and run them in place of the default DATA _CST step.

%LET _NCST=3;

DATA _CST;
  SET _CST1;
  RETAIN _CST 1 _SEQ;
  BY COUNTRY STRA_ID_CORE;

  P_OSU_S1_CORE=LAG(OSU_S1_CORE);

  IF FIRST.STRA_ID_CORE THEN DO;
    _SEQ=1;
    P_OSU_S1_CORE=.;
    _CST=1;
  END;
  ELSE _SEQ+1;

  IF OSU_S1_CORE>P_OSU_S1_CORE AND _SEQ>&_NCST. AND NOT(LAST.STRA_ID_CORE) THEN DO;
    _CST+1;
    _SEQ=1;
  END;

  DROP _SEQ P_OSU_S1_CORE ;
RUN;
*/

/* Adds the computational stratum _CST to every record of DATA_CORE by matching
   on COUNTRY, STRA_ID_CORE and PSU_CORE. The left join keeps all DATA_CORE
   records, and DATA_CORE is rebuilt in place. */
proc sql;
  create table DATA_CORE as
  select core.*,
         cst._CST
  from DATA_CORE as core
  left join _CST as cst
    on  core.COUNTRY      = cst.COUNTRY
    and core.STRA_ID_CORE = cst.STRA_ID_CORE
    and core.PSU_CORE     = cst.PSU_CORE;
quit;

/* Sort DATA_CORE by country, formal stratum and computational stratum.
   COUNTRY and STRA_ID_CORE are two separate BY variables; written without the
   separating blank they form one unknown variable name and PROC SORT stops
   with an error. */
PROC SORT DATA=DATA_CORE;
   BY COUNTRY STRA_ID_CORE _CST;
RUN ;

/* pop_STRA_ID: one row per COUNTRY / STRA_ID_CORE / _CST with
   _total_ = sum of the weights and sample = number of records. */
proc sql;
  create table pop_STRA_ID as
  select COUNTRY,
         STRA_ID_CORE,
         _CST,
         sum(weight_core) as _total_,
         count(*)         as sample
  from data_core
  group by COUNTRY,
           STRA_ID_CORE,
           _CST;
quit;

/* _total_ is never allowed to be smaller than the number of sampled records */
data pop_STRA_ID;
  set pop_STRA_ID;
  _total_ = max(_total_, sample);
run;

/* Put data_core in COUNTRY / STRA_ID_CORE / _CST order before the BY COUNTRY
   processing of PROC SURVEYMEANS. */
proc sort data=data_core;
  by COUNTRY
     STRA_ID_CORE
     _CST;
run;

/* Suppress all displayed output of the step below: only the dataset created by
   ODS OUTPUT is needed. */
ods exclude all;

/* Estimates, per country and per domain, the total of UAA with its variance,
   coefficient of variation and confidence limits (SUM VARSUM CVSUM CLSUM).
   Design: strata = formal stratum x computational stratum, clusters = PSU_CORE,
   weights = WEIGHT_CORE; TOTAL= supplies the _total_ values of pop_STRA_ID. */
proc surveymeans data=data_core total=pop_STRA_ID sum varsum cvsum clsum ;

  by COUNTRY;

  /* Two domain tables: FARMTYPE x SO_EUR x AGRAREA and FARMTYPE x SO_EUR */
  domain FARMTYPE*SO_EUR*AGRAREA FARMTYPE*SO_EUR;

  var UAA;

  strata STRA_ID_CORE _CST;

  cluster PSU_CORE;

  weight WEIGHT_CORE;

  /* Domain estimates are written to the dataset cv */
  ods output domain=cv;

run ;

/* Restore normal ODS display; otherwise the exclusion stays in effect and
   later steps in the same session would print nothing. */
ods exclude none;

The result is the coefficients of variation stored in field CVSUM in the table cv.

...