DESY3 MICE2 source catalog
FIRST ITERATION
There is a notebook where I convert the FITS files into Parquet files:
DES_Y3_MICE2_source_catalog_IA_mocks
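A minimal sketch of what the conversion does, assuming astropy and pyarrow are available; the input file name is hypothetical, and the columns follow the Hive schema below:

# Sketch of the FITS -> Parquet conversion (input file name is hypothetical).
from astropy.table import Table
import pyarrow as pa
import pyarrow.parquet as pq

# Read one source-catalog FITS file and convert it to an Arrow table;
# dtypes should match the Hive schema (BIGINT id, DOUBLE/FLOAT columns).
tab = Table.read("scattZ_sof_mice_Feb3rd_maxmag22_5.fits")
pq.write_table(pa.Table.from_pandas(tab.to_pandas(), preserve_index=False),
               "scattZ_sof_mice_Feb3rd_maxmag22_5.parquet")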
CREATE EXTERNAL TABLE jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq (
  `unique_gal_id` BIGINT,
  `z_cgal` DOUBLE,
  `ra_gal` DOUBLE,
  `dec_gal` DOUBLE,
  `g_sof` DOUBLE,
  `r_sof` DOUBLE,
  `i_sof` DOUBLE,
  `z_sof` DOUBLE,
  `z_dnf_mean_sof` FLOAT,
  `z_dnf_mc_sof` FLOAT,
  `des_asahi_full_g_true` FLOAT,
  `des_asahi_full_r_true` FLOAT,
  `des_asahi_full_i_true` FLOAT,
  `des_asahi_full_z_true` FLOAT,
  `kappa` FLOAT,
  `gamma1` FLOAT,
  `gamma2` FLOAT,
  `eps1` DOUBLE,
  `eps2` DOUBLE
)
STORED AS PARQUET
LOCATION '/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks';
CREATE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c (
  `unique_gal_id` BIGINT,
  `z_cgal` DOUBLE,
  `ra_gal` DOUBLE,
  `dec_gal` DOUBLE,
  `g_sof` DOUBLE,
  `r_sof` DOUBLE,
  `i_sof` DOUBLE,
  `z_sof` DOUBLE,
  `z_dnf_mean_sof` FLOAT,
  `z_dnf_mc_sof` FLOAT,
  `des_asahi_full_g_true` FLOAT,
  `des_asahi_full_r_true` FLOAT,
  `des_asahi_full_i_true` FLOAT,
  `des_asahi_full_z_true` FLOAT,
  `kappa` FLOAT,
  `gamma1` FLOAT,
  `gamma2` FLOAT,
  `eps1` DOUBLE,
  `eps2` DOUBLE
)
CLUSTERED BY (unique_gal_id) INTO 8 BUCKETS
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
INSERT OVERWRITE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c SELECT * FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq;
ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS;
ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS FOR COLUMNS;
SECOND ITERATION
It is necessary to add partitions to the table to be able to filter by "tomographic source bin" (the bins come in different files).
I create another Parquet table with partitions, and I also create the corresponding partition directories in HDFS and copy the files into them, as sketched below.
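A minimal sketch of the HDFS step, assuming the standard Hive partition layout (one tomographic_bin=<n> subdirectory per bin, which is what MSCK REPAIR TABLE later discovers); the number of bins and the local file names are assumptions:

# Create one partition directory per tomographic bin and upload the files.
import subprocess

BASE = "hdfs://namenode01/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks"

for tbin in range(4):  # number of bins is an assumption
    part_dir = f"{BASE}/tomographic_bin={tbin}"  # Hive partition naming convention
    subprocess.run(["hdfs", "dfs", "-mkdir", "-p", part_dir], check=True)
    subprocess.run(["hdfs", "dfs", "-put",
                    f"source_catalog_bin{tbin}.parquet", part_dir], check=True)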
Then I create the ORC table and overwrite it with the Parquet data.
There is also a Google Doc, written by M. Crocce and Lucas Secco, with the information about the catalog:
https://docs.google.com/document/d/1PieK-sds2aUpGQMjmwREtXmquwr6EWe2TnOSQ12vRWA/edit
These are the commands:
CREATE EXTERNAL TABLE `jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq` (
  `unique_gal_id` BIGINT,
  `z_cgal` DOUBLE,
  `ra_gal` DOUBLE,
  `dec_gal` DOUBLE,
  `g_sof` DOUBLE,
  `r_sof` DOUBLE,
  `i_sof` DOUBLE,
  `z_sof` DOUBLE,
  `z_dnf_mean_sof` FLOAT,
  `z_dnf_mc_sof` FLOAT,
  `des_asahi_full_g_true` FLOAT,
  `des_asahi_full_r_true` FLOAT,
  `des_asahi_full_i_true` FLOAT,
  `des_asahi_full_z_true` FLOAT,
  `kappa` FLOAT,
  `gamma1` FLOAT,
  `gamma2` FLOAT,
  `eps1` DOUBLE,
  `eps2` DOUBLE
)
PARTITIONED BY (tomographic_bin INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 'hdfs://namenode01/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks';
MSCK REPAIR TABLE scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
The directories must have the right permissions! Otherwise it does not work and the directories are not read; basically, it does not work as an external table!
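One way to fix this, sketched as a recursive chmod (the 755 mode is an assumption; anything the Hive user can read and list works):

# Make the table location and its partition directories readable by Hive.
import subprocess

subprocess.run(["hdfs", "dfs", "-chmod", "-R", "755",
                "/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks"],
               check=True)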
CREATE TABLE `scattz_sof_mice_feb3rd_maxmag22_5_part_c` (
  `unique_gal_id` BIGINT COMMENT 'unique galaxy id',
  `z_cgal` DOUBLE COMMENT 'galaxy true redshift',
  `ra_gal` DOUBLE COMMENT 'galaxy right ascension (degrees)',
  `dec_gal` DOUBLE COMMENT 'galaxy declination (degrees)',
  `g_sof` DOUBLE COMMENT 'observed g-magnitude resampled from DES Y3Gold2 sof g magnitude',
  `r_sof` DOUBLE COMMENT 'observed r-magnitude resampled from DES Y3Gold2 sof r magnitude',
  `i_sof` DOUBLE COMMENT 'observed i-magnitude resampled from DES Y3Gold2 sof i magnitude',
  `z_sof` DOUBLE COMMENT 'observed z-magnitude resampled from DES Y3Gold2 sof z magnitude',
  `z_dnf_mean_sof` FLOAT COMMENT 'photo-z from DNF, z mean statistic; DNF run using sof resampled magnitudes',
  `z_dnf_mc_sof` FLOAT COMMENT 'photo-z from DNF, z Monte Carlo statistic; DNF run using sof resampled magnitudes',
  `des_asahi_full_g_true` FLOAT COMMENT 'observed magnitude in des asahi full g-band',
  `des_asahi_full_r_true` FLOAT COMMENT 'observed magnitude in des asahi full r-band',
  `des_asahi_full_i_true` FLOAT COMMENT 'observed magnitude in des asahi full i-band',
  `des_asahi_full_z_true` FLOAT COMMENT 'observed magnitude in des asahi full z-band',
  `kappa` FLOAT COMMENT 'convergence',
  `gamma1` FLOAT COMMENT 'shear',
  `gamma2` FLOAT COMMENT 'shear',
  `eps1` DOUBLE COMMENT 'intrinsic shape',
  `eps2` DOUBLE COMMENT 'intrinsic shape'
)
PARTITIONED BY (tomographic_bin INT COMMENT 'tomographic bin')
CLUSTERED BY (unique_gal_id) INTO 16 BUCKETS
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
INSERT OVERWRITE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION (tomographic_bin) SELECT * FROM jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION(tomographic_bin) COMPUTE STATISTICS;
ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c COMPUTE STATISTICS FOR COLUMNS;