DESY3 MICE2 source catalog
 

FIRST ITERATION

There is a notebook where I convert the FITS files into Parquet files:

DES_Y3_MICE2_source_catalog_IA_mocks

   CREATE EXTERNAL TABLE jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq (
   `unique_gal_id` BIGINT,
   `z_cgal` DOUBLE,
   `ra_gal` DOUBLE,
   `dec_gal` DOUBLE,
   `g_sof` DOUBLE,
   `r_sof` DOUBLE,
   `i_sof` DOUBLE,
   `z_sof` DOUBLE,
   `z_dnf_mean_sof` FLOAT,
   `z_dnf_mc_sof` FLOAT,
   `des_asahi_full_g_true` FLOAT,
   `des_asahi_full_r_true` FLOAT,
   `des_asahi_full_i_true` FLOAT,
   `des_asahi_full_z_true` FLOAT,
   `kappa` FLOAT,
   `gamma1` FLOAT,
   `gamma2` FLOAT,
   `eps1` DOUBLE,
   `eps2` DOUBLE
   )
   STORED AS PARQUET
   LOCATION '/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks'
   ;
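
A quick sanity check that Hive can read the Parquet files (a hypothetical example, not a command from the original workflow):

   SELECT COUNT(*) FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq;
   SELECT unique_gal_id, z_cgal, ra_gal, dec_gal
   FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq
   LIMIT 5;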


   CREATE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c (
   `unique_gal_id` BIGINT,
   `z_cgal` DOUBLE,
   `ra_gal` DOUBLE,
   `dec_gal` DOUBLE,
   `g_sof` DOUBLE,
   `r_sof` DOUBLE,
   `i_sof` DOUBLE,
   `z_sof` DOUBLE,
   `z_dnf_mean_sof` FLOAT,
   `z_dnf_mc_sof` FLOAT,
   `des_asahi_full_g_true` FLOAT,
   `des_asahi_full_r_true` FLOAT,
   `des_asahi_full_i_true` FLOAT,
   `des_asahi_full_z_true` FLOAT,
   `kappa` FLOAT,
   `gamma1` FLOAT,
   `gamma2` FLOAT,
   `eps1` DOUBLE,
   `eps2` DOUBLE
   )
   CLUSTERED BY (unique_gal_id)
   INTO 8 BUCKETS
   ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
   STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
   OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
   ;
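
Bucketing by unique_gal_id also makes it cheap to pull a reproducible subsample with TABLESAMPLE (a sketch of what the bucketing enables, not part of the original workflow):

   SELECT t.unique_gal_id, t.z_cgal, t.eps1, t.eps2
   FROM cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c
   TABLESAMPLE(BUCKET 1 OUT OF 8 ON unique_gal_id) t
   LIMIT 10;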


   INSERT OVERWRITE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c
   SELECT *
   FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq;
   ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS;
   ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS FOR COLUMNS;
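
After the ANALYZE statements, the collected statistics (numRows, rawDataSize, etc.) should appear in the table parameters; one way to check:

   DESCRIBE FORMATTED cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c;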


SECOND ITERATION

It is necessary to add partitions to the table in order to filter by "tomographic source bin" (the bins are stored in separate files).

I create another Parquet table, this time with partitions.

I also create the corresponding directories in HDFS and copy the files into them.

Then I create the ORC table and overwrite it with the contents of the partitioned Parquet table.

There is also a Google Doc written by M. Crocce and Lucas Secco with the catalog information:

https://docs.google.com/document/d/1PieK-sds2aUpGQMjmwREtXmquwr6EWe2TnOSQ12vRWA/edit

These are the commands:

   CREATE EXTERNAL TABLE `jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq`(
 `unique_gal_id` bigint, 
 `z_cgal` double, 
 `ra_gal` double, 
 `dec_gal` double, 
 `g_sof` double, 
 `r_sof` double, 
 `i_sof` double, 
 `z_sof` double, 
 `z_dnf_mean_sof` float, 
 `z_dnf_mc_sof` float, 
 `des_asahi_full_g_true` float, 
 `des_asahi_full_r_true` float, 
 `des_asahi_full_i_true` float, 
 `des_asahi_full_z_true` float, 
 `kappa` float, 
 `gamma1` float, 
 `gamma2` float, 
 `eps1` double, 
 `eps2` double)
   PARTITIONED BY (tomographic_bin int)
   ROW FORMAT SERDE 
 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
   STORED AS INPUTFORMAT 
 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
   OUTPUTFORMAT 
 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
   LOCATION
 'hdfs://namenode01/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks'
   ;
   MSCK REPAIR TABLE scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
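
MSCK REPAIR only discovers partitions whose directories follow the Hive naming convention under the table LOCATION (e.g. .../des_y3_mice2_source_catalog_IA_mocks/tomographic_bin=1/). If automatic discovery fails, a partition can also be registered by hand (a sketch; the bin value is an arbitrary example):

   ALTER TABLE jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq
   ADD PARTITION (tomographic_bin=1);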

The directories must have the right permissions!! Otherwise it does not work and the directories are not read: it basically does not work as an external table!


   CREATE TABLE `scattz_sof_mice_feb3rd_maxmag22_5_part_c`(
 `unique_gal_id` bigint COMMENT 'unique galaxy id', 
 `z_cgal` double COMMENT 'galaxy true redshift', 
 `ra_gal` double COMMENT 'galaxy right ascension (degrees)', 
 `dec_gal` double COMMENT 'galaxy declination (degrees)', 
 `g_sof` double COMMENT 'observed g-magnitude resampled from DES Y3Gold2 sof g magnitude', 
 `r_sof` double COMMENT 'observed r-magnitude resampled from DES Y3Gold2 sof r magnitude', 
 `i_sof` double COMMENT 'observed i-magnitude resampled from DES Y3Gold2 sof i magnitude', 
 `z_sof` double COMMENT 'observed z-magnitude resampled from DES Y3Gold2 sof z magnitude', 
 `z_dnf_mean_sof` float COMMENT 'Photo-z from DNF. Z mean statistic. Dnf run using sof resampled magnitudes', 
 `z_dnf_mc_sof` float COMMENT 'Photo-z from DNF. Z Monte-Carlo statistic. Dnf run using sof resampled magnitudes', 
 `des_asahi_full_g_true` float COMMENT 'observed magnitude in des asahi full g-band', 
 `des_asahi_full_r_true` float COMMENT 'observed magnitude in des asahi full r-band', 
 `des_asahi_full_i_true` float COMMENT 'observed magnitude in des asahi full i-band', 
 `des_asahi_full_z_true` float COMMENT 'observed magnitude in des asahi full z-band', 
 `kappa` float COMMENT 'convergence', 
 `gamma1` float COMMENT 'shear', 
 `gamma2` float COMMENT 'shear', 
 `eps1` double COMMENT 'intrinsic shape', 
 `eps2` double COMMENT 'intrinsic shape'
   )
   PARTITIONED BY (tomographic_bin int COMMENT 'tomographic bin')
   CLUSTERED BY (unique_gal_id)
   INTO 16 BUCKETS
   ROW FORMAT SERDE 
 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
   STORED AS INPUTFORMAT 
 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
   OUTPUTFORMAT 
 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
   ;
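
Before running the dynamic-partition INSERT below, Hive usually requires dynamic partitioning to be enabled; whether these settings are already the cluster defaults is an assumption:

   SET hive.exec.dynamic.partition=true;
   SET hive.exec.dynamic.partition.mode=nonstrict;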


   INSERT OVERWRITE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION(tomographic_bin)
   SELECT * FROM jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;


   ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION(tomographic_bin) COMPUTE STATISTICS;
   ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c COMPUTE STATISTICS FOR COLUMNS;
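
With the partitions in place, a filter on tomographic_bin only scans the matching partition instead of the whole catalog; for example (a hypothetical query with an arbitrary bin value):

   SELECT COUNT(*), AVG(z_cgal)
   FROM cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c
   WHERE tomographic_bin = 2;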