DESY3 MICE2 source catalog
== FIRST ITERATION ==
There is a notebook where I convert the FITS files into Parquet files:
DES_Y3_MICE2_source_catalog_IA_mocks
 CREATE EXTERNAL TABLE jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq (
     `unique_gal_id` BIGINT,
     `z_cgal` DOUBLE,
     `ra_gal` DOUBLE,
     `dec_gal` DOUBLE,
     `g_sof` DOUBLE,
     `r_sof` DOUBLE,
     `i_sof` DOUBLE,
     `z_sof` DOUBLE,
     `z_dnf_mean_sof` FLOAT,
     `z_dnf_mc_sof` FLOAT,
     `des_asahi_full_g_true` FLOAT,
     `des_asahi_full_r_true` FLOAT,
     `des_asahi_full_i_true` FLOAT,
     `des_asahi_full_z_true` FLOAT,
     `kappa` FLOAT,
     `gamma1` FLOAT,
     `gamma2` FLOAT,
     `eps1` DOUBLE,
     `eps2` DOUBLE
 )
 STORED AS PARQUET
 LOCATION '/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks'
 ;
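As a minimal sanity check (not part of the original notes), one can confirm that the external table actually picks up the Parquet files:

 -- quick sanity check: a non-zero count means the Parquet files are being read
 SELECT COUNT(*)
 FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq;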
 CREATE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c (
     `unique_gal_id` BIGINT,
     `z_cgal` DOUBLE,
     `ra_gal` DOUBLE,
     `dec_gal` DOUBLE,
     `g_sof` DOUBLE,
     `r_sof` DOUBLE,
     `i_sof` DOUBLE,
     `z_sof` DOUBLE,
     `z_dnf_mean_sof` FLOAT,
     `z_dnf_mc_sof` FLOAT,
     `des_asahi_full_g_true` FLOAT,
     `des_asahi_full_r_true` FLOAT,
     `des_asahi_full_i_true` FLOAT,
     `des_asahi_full_z_true` FLOAT,
     `kappa` FLOAT,
     `gamma1` FLOAT,
     `gamma2` FLOAT,
     `eps1` DOUBLE,
     `eps2` DOUBLE
 )
 CLUSTERED BY (unique_gal_id)
 INTO 8 BUCKETS
 ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
 STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
 ;
 INSERT OVERWRITE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c
 SELECT *
 FROM jcarrete.scattZ_sof_mice_Feb3rd_maxmag22_5_pq;
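Note that Hive versions before 2.x do not enforce bucketing on insert by default, so the insert above may need to be preceded by:

 -- make Hive honor the CLUSTERED BY ... INTO 8 BUCKETS clause when writing
 SET hive.enforce.bucketing = true;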
 ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS;

 ANALYZE TABLE cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c COMPUTE STATISTICS FOR COLUMNS;
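The statistics computed by ANALYZE are stored in the metastore and can be inspected afterwards (the column name below is just an example):

 -- table-level stats (numRows, totalSize, ...) appear among the table parameters
 DESCRIBE FORMATTED cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c;
 -- column-level stats for a single column
 DESCRIBE FORMATTED cosmohub.scattZ_sof_mice_Feb3rd_maxmag22_5_c unique_gal_id;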
== SECOND ITERATION ==
It is necessary to add partitions to the table to be able to filter by tomographic source bin (the bins come in separate files).

I create another Parquet table, this time with partitions.

I also create the corresponding directories in HDFS and copy the files into them (the expected layout is sketched below).

Then I create the ORC table and overwrite it with the partitioned data.
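Hive expects one subdirectory per partition value, named after the partition column. Under the table location the layout should therefore look roughly like this (the bin values shown are an assumption, for illustration):

 des_y3_mice2_source_catalog_IA_mocks/
     tomographic_bin=0/
     tomographic_bin=1/
     ...

with each tomographic_bin=N directory holding the Parquet files for that source bin.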
There is also a Google Doc written by M. Crocce and Lucas Secco with the information about the catalog:
https://docs.google.com/document/d/1PieK-sds2aUpGQMjmwREtXmquwr6EWe2TnOSQ12vRWA/edit
These are the commands:
 CREATE EXTERNAL TABLE `jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq`(
     `unique_gal_id` bigint,
     `z_cgal` double,
     `ra_gal` double,
     `dec_gal` double,
     `g_sof` double,
     `r_sof` double,
     `i_sof` double,
     `z_sof` double,
     `z_dnf_mean_sof` float,
     `z_dnf_mc_sof` float,
     `des_asahi_full_g_true` float,
     `des_asahi_full_r_true` float,
     `des_asahi_full_i_true` float,
     `des_asahi_full_z_true` float,
     `kappa` float,
     `gamma1` float,
     `gamma2` float,
     `eps1` double,
     `eps2` double)
 PARTITIONED BY (tomographic_bin int)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
 STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
 LOCATION 'hdfs://namenode01/user/jcarrete/data/mice/des_y3_mice2_source_catalog_IA_mocks'
 ;
 MSCK REPAIR TABLE scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
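MSCK REPAIR TABLE scans the table location and registers every tomographic_bin=N directory it finds as a partition. The result can be verified, or the partitions registered one at a time instead (the bin value below is illustrative):

 -- list the partitions that were discovered
 SHOW PARTITIONS scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
 -- manual alternative to MSCK REPAIR, one partition at a time
 ALTER TABLE scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq
   ADD PARTITION (tomographic_bin = 0);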
The directories must have the correct permissions! Otherwise it does not work and the directories are not read; basically, it does NOT work as an external table!
 CREATE TABLE `scattz_sof_mice_feb3rd_maxmag22_5_part_c`(
     `unique_gal_id` bigint COMMENT 'unique galaxy id',
     `z_cgal` double COMMENT 'galaxy true redshift',
     `ra_gal` double COMMENT 'galaxy right ascension (degrees)',
     `dec_gal` double COMMENT 'galaxy declination (degrees)',
     `g_sof` double COMMENT 'observed g-magnitude resampled from DES Y3Gold2 sof g magnitude',
     `r_sof` double COMMENT 'observed r-magnitude resampled from DES Y3Gold2 sof r magnitude',
     `i_sof` double COMMENT 'observed i-magnitude resampled from DES Y3Gold2 sof i magnitude',
     `z_sof` double COMMENT 'observed z-magnitude resampled from DES Y3Gold2 sof z magnitude',
     `z_dnf_mean_sof` float COMMENT 'Photo-z from DNF. Z mean statistic. DNF run using sof resampled magnitudes',
     `z_dnf_mc_sof` float COMMENT 'Photo-z from DNF. Z Monte-Carlo statistic. DNF run using sof resampled magnitudes',
     `des_asahi_full_g_true` float COMMENT 'observed magnitude in des asahi full g-band',
     `des_asahi_full_r_true` float COMMENT 'observed magnitude in des asahi full r-band',
     `des_asahi_full_i_true` float COMMENT 'observed magnitude in des asahi full i-band',
     `des_asahi_full_z_true` float COMMENT 'observed magnitude in des asahi full z-band',
     `kappa` float COMMENT 'convergence',
     `gamma1` float COMMENT 'shear',
     `gamma2` float COMMENT 'shear',
     `eps1` double COMMENT 'intrinsic shape',
     `eps2` double COMMENT 'intrinsic shape'
 )
 PARTITIONED BY (tomographic_bin int COMMENT 'tomographic bin')
 CLUSTERED BY (unique_gal_id)
 INTO 16 BUCKETS
 ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
 STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
 ;
 INSERT OVERWRITE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION (tomographic_bin)
 SELECT *
 FROM jcarrete.scattz_sof_mice_feb3rd_maxmag22_5_partitioned_pq;
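The PARTITION (tomographic_bin) clause without a value makes this a dynamic-partition insert: Hive takes the bin for each row from the last column of the SELECT (SELECT * returns the partition column last). In Hive's strict mode an insert with no static partition value is rejected, so these settings are usually required before running the insert above:

 SET hive.exec.dynamic.partition = true;
 -- allow an insert where every partition column is dynamic
 SET hive.exec.dynamic.partition.mode = nonstrict;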
 ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c PARTITION(tomographic_bin) COMPUTE STATISTICS;

 ANALYZE TABLE cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c COMPUTE STATISTICS FOR COLUMNS;
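With the partitioned ORC table in place, a filter on the partition column only scans the matching directory; a usage sketch (the bin value is chosen just for illustration):

 -- partition pruning: only the tomographic_bin=1 partition is read
 SELECT COUNT(*)
 FROM cosmohub.scattz_sof_mice_feb3rd_maxmag22_5_part_c
 WHERE tomographic_bin = 1;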