Mock pipeline v2.0

Dependencies

  • Python 3.8 for PEP 574 (Pickle protocol 5 with out-of-band data)
  • Spark 3.0.0 for SPARK-28198 (Add mapPartitionsInPandas to allow an iterator of DataFrames)
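A minimal, self-contained sketch of the PEP 574 feature, independent of Spark: with protocol 5, large buffers can be handed to a callback instead of being copied into the pickle stream, and are supplied back explicitly at load time:

import pickle
from pickle import PickleBuffer

# A 1 MB payload that should not be copied into the pickle stream.
big = bytearray(b"\x00" * 1_000_000)

out_of_band = []
data = pickle.dumps(PickleBuffer(big), protocol=5,
                    buffer_callback=out_of_band.append)

# The stream itself stays tiny; the buffer went to the callback instead.
print(len(data), len(out_of_band))

# At load time the out-of-band buffers are passed back explicitly.
restored = pickle.loads(data, buffers=out_of_band)
print(bytes(restored) == bytes(big))   # True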

Installation

Development Toolset v8

Required to compile Python 3.8, as the system GCC 4.8 is not compatible.

sudo yum install centos-release-scl
sudo yum install devtoolset-8
scl enable devtoolset-8 bash
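Optional, and not a check of the devtoolset itself: the GCC the stock interpreter was built with can be read from Python, which illustrates the 4.8 toolchain problem (output is host-dependent):

import platform

# On an unmodified CentOS 7 host this typically reports the GCC 4.8 toolchain.
print(platform.python_compiler())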

Python 3.8.0

Get system compilation flags using:

>>> import sysconfig
>>> sysconfig.get_config_var('CONFIG_ARGS')
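The CFLAGS/LDFLAGS exports below reproduce the corresponding flag variables of the system build, which can be read the same way:

>>> sysconfig.get_config_var('CFLAGS')
>>> sysconfig.get_config_var('LDFLAGS')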

Configure:

export CFLAGS="-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong --param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic -D_GNU_SOURCE -fPIC -fwrapv"
export LDFLAGS="-Wl,-z,relro  -g"
export PKG_CONFIG_PATH=:/usr/lib64/pkgconfig:/usr/share/pkgconfig
./configure --prefix=/software/astro/centos7/python/3.8.0 --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \
--disable-dependency-tracking --enable-ipv6 --enable-shared --with-computed-gotos=yes --with-dbmliborder=gdbm:ndbm:bdb \
--with-system-expat --with-system-ffi --enable-loadable-sqlite-extensions --with-dtrace --with-valgrind --without-ensurepip --enable-optimizations

Compile & install:

make -j 24
make install

Fix RPATH

Needed because --enable-shared links the python3.8 binary against libpython3.8.so inside the new prefix; setting the RPATH lets it find the library without LD_LIBRARY_PATH.

/software/astro/sl6/patchelf/0.8/bin/patchelf --set-rpath /software/astro/centos7/python/3.8.0/lib/ /software/astro/centos7/python/3.8.0/bin/python3.8

Pip & virtualenv:

cd /tmp
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
/software/astro/centos7/python/3.8.0/bin/python3 get-pip.py
/software/astro/centos7/python/3.8.0/bin/pip install virtualenv

Virtual environment

cd ~/envs
/software/astro/centos7/python/3.8.0/bin/virtualenv mocks3
source mocks3/bin/activate
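Optional sanity check from inside the activated environment (the expected values assume the paths used above):

import sys

# Both should point into the mocks3 virtualenv built on the 3.8.0 install.
print(sys.executable)          # ~/envs/mocks3/bin/python
print(sys.version.split()[0])  # 3.8.0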

Spark v3.0.0

cd /tmp
wget https://www-eu.apache.org/dist/spark/spark-3.0.0-preview/spark-3.0.0-preview-bin-hadoop3.2.tgz
tar xvzf spark-3.0.0-preview-bin-hadoop3.2.tgz
mkdir -p /software/astro/centos7/spark/3.0.0
mv spark-3.0.0-preview-bin-hadoop3.2/* /software/astro/centos7/spark/3.0.0

cd /software/astro/centos7/spark/3.0.0/python
pip install pypandoc
python setup.py sdist
pip install dist/pyspark-3.0.0.dev0.tar.gz
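A minimal smoke test of the installed PySpark, exercising the iterator-of-DataFrames API from SPARK-28198 (exposed as DataFrame.mapInPandas; the signature used below is the one in the final 3.0.0 release, the preview build may still expect the pandas_udf form). pandas and pyarrow must be installed in the same environment:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("mapInPandas-check").getOrCreate()

def double_ids(batches):
    # `batches` is an iterator of pandas.DataFrame, one per Arrow batch.
    for pdf in batches:
        yield pdf.assign(id=pdf["id"] * 2)

spark.range(8).mapInPandas(double_ids, schema="id long").show()
spark.stop()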

Environment

Install JupyterLab & extensions

source ~/envs/mocks3/bin/activate

pip install jupyterlab jupytext

Start JupyterLab and open it in a browser to trigger the extension rebuild:

jupyter-lab --no-browser --ip=$(hostname)
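For reference, jupytext pairs notebooks with plain scripts; a minimal sketch of its Python API, using a hypothetical analysis.py script in percent format:

import jupytext

# Read the script representation and materialise it as a notebook for JupyterLab.
nb = jupytext.read("analysis.py")
jupytext.write(nb, "analysis.ipynb")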

Input preparation

CREATE TABLE tallada.`flagship_v1_1_halos`(
  `bx` tinyint, 
  `by` tinyint, 
  `bz` tinyint, 
  `id` bigint, 
  `pid` bigint, 
  `num_p` int, 
  `x` float, 
  `y` float, 
  `z` float, 
  `vx` float, 
  `vy` float, 
  `vz` float,
  hpix_2_nest smallint,
  hpix_13_nest bigint
)
PARTITIONED BY (
  z_step smallint
)
CLUSTERED BY (
  hpix_2_nest
)
SORTED BY ( 
  hpix_13_nest
)
INTO 192 BUCKETS
STORED AS PARQUET
;
INSERT OVERWRITE TABLE tallada.`flagship_v1_1_halos` PARTITION(z_step)
SELECT
  bx, `by`, bz,
  id, pid,
  num_p, x, y, z, vx, vy, vz,
  default.vec2pix(2, x, y, z), default.vec2pix(13, x, y, z),
  step
FROM cosmohub.flagship_rockstar_octant1_c AS m
JOIN flagship_steps AS s
  ON SQRT(m.x*m.x + m.y*m.y + m.z*m.z) BETWEEN s.r_min AND s.r_max
WHERE x BETWEEN 0 AND 200
  AND y BETWEEN 0 AND 200
  AND z>=0
  AND pid=-1
;
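For reference, a sketch of what the default.vec2pix UDF is assumed to compute: the NESTED HEALPix pixel index of each position's direction vector at a given order, with nside = 2**order. The healpy call below is an assumption based on the hpix_*_nest column names, not the UDF's actual implementation:

import numpy as np
import healpy as hp

# Example halo position; vec2pix only uses the direction, so no normalisation is needed.
x, y, z = np.array([150.0]), np.array([80.0]), np.array([40.0])

hpix_2_nest = hp.vec2pix(2**2, x, y, z, nest=True)    # coarse index, fits in a smallint
hpix_13_nest = hp.vec2pix(2**13, x, y, z, nest=True)  # fine index, stored as a bigint

print(hpix_2_nest, hpix_13_nest)

Order 2 has 12 * 4**2 = 192 pixels, which matches the INTO 192 BUCKETS clause above.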