Difference between revisions of "Mock pipeline v2.0"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
| (3 intermediate revisions by the same user not shown) | |||
| Line 64: | Line 64: | ||
|   python setup.py sdist |   python setup.py sdist | ||
|   pip install dist/pyspark-3.0.0.dev0.tar.gz |   pip install dist/pyspark-3.0.0.dev0.tar.gz | ||
| + | |||
| + | === Environment === | ||
| + | |||
| + | Install JupyterLab & extensions | ||
| + | |||
| + |  source ~/envs/mocks3/bin/activate | ||
| + | |||
| + |  pip install jupyterlab jupytext | ||
| + | |||
| + | Start JupyterLab, then open it in a browser to trigger the rebuild: | ||
| + | |||
| + |  jupyter-lab --no-browser --ip=$(hostname) | ||
| + | |||
| + | == Input preparation == | ||
| + | |||
| + |  CREATE TABLE tallada.`flagship_v1_1_halos`( | ||
| + |    `bx` tinyint,  | ||
| + |    `by` tinyint,  | ||
| + |    `bz` tinyint,  | ||
| + |    `id` bigint,  | ||
| + |    `pid` bigint,  | ||
| + |    `num_p` int,  | ||
| + |    `x` float,  | ||
| + |    `y` float,  | ||
| + |    `z` float,  | ||
| + |    `vx` float,  | ||
| + |    `vy` float,  | ||
| + |    `vz` float, | ||
| + |    hpix_2_nest smallint, | ||
| + |    hpix_13_nest bigint | ||
| + |  ) | ||
| + |  PARTITIONED BY ( | ||
| + |    z_step smallint | ||
| + |  ) | ||
| + |  CLUSTERED BY ( | ||
| + |    hpix_2_nest | ||
| + |  ) | ||
| + |  SORTED BY (  | ||
| + |    hpix_13_nest | ||
| + |  ) | ||
| + |  INTO 192 BUCKETS | ||
| + |  STORED AS PARQUET | ||
| + |  ; | ||
| + | |||
| + |  INSERT OVERWRITE TABLE tallada.`flagship_v1_1_halos` PARTITION(z_step) | ||
| + |  SELECT | ||
| + |    bx, `by`, bz, | ||
| + |    id, pid, | ||
| + |    num_p, x, y, z, vx, vy, vz, | ||
| + |    default.vec2pix(2, x, y, z), default.vec2pix(13, x, y, z), | ||
| + |    step | ||
| + |  FROM cosmohub.flagship_rockstar_octant1_c AS m | ||
| + |  JOIN flagship_steps AS s | ||
| + |    ON SQRT(m.x*m.x + m.y*m.y + m.z*m.z) BETWEEN s.r_min AND s.r_max | ||
| + |  WHERE x BETWEEN 0 AND 200 | ||
| + |    AND y BETWEEN 0 AND 200 | ||
| + |    AND z>=0 | ||
| + |    AND pid=-1 | ||
| + |  ; | ||
Latest revision as of 10:17, 3 January 2020
Dependencies
- Python 3.8 for PEP 574 (Pickle protocol 5 with out-of-band data)
- Spark 3.0.0 for SPARK-28198 (Add mapPartitionsInPandas to allow an iterator of DataFrames)
Installation
Development Toolset v8
Required in order to compile Python 3.8, as GCC v4.8 is not compatible.
sudo yum install centos-release-scl
sudo yum install devtoolset-8
scl enable devtoolset-8 bash
Python 3.8.0
Get system compilation flags using:
>>> import sysconfig
>>> sysconfig.get_config_var('CONFIG_ARGS')
Configure:
export CFLAGS="-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong --param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic -D_GNU_SOURCE -fPIC -fwrapv"
export LDFLAGS="-Wl,-z,relro -g"
export PKG_CONFIG_PATH=:/usr/lib64/pkgconfig:/usr/share/pkgconfig
./configure --prefix=/software/astro/centos7/python/3.8.0 --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu \
  --disable-dependency-tracking --enable-ipv6 --enable-shared --with-computed-gotos=yes --with-dbmliborder=gdbm:ndbm:bdb \
  --with-system-expat --with-system-ffi --enable-loadable-sqlite-extensions --with-dtrace --with-valgrind --without-ensurepip --enable-optimizations
Compile & install:
make -j 24
make install
Fix RPATH
/software/astro/sl6/patchelf/0.8/bin/patchelf --set-rpath /software/astro/centos7/python/3.8.0/lib/ /software/astro/centos7/python/3.8.0/bin/python3.8
Pip & virtualenv:
cd /tmp
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
/software/astro/centos7/python/3.8.0/bin/python3 get-pip.py
/software/astro/centos7/python/3.8.0/bin/pip install virtualenv
Virtual environment
cd ~/envs
/software/astro/centos7/python/3.8.0/bin/virtualenv mocks3
source mocks3/bin/activate
Spark v3.0.0
cd /tmp
wget https://www-eu.apache.org/dist/spark/spark-3.0.0-preview/spark-3.0.0-preview-bin-hadoop3.2.tgz
tar xvvzf spark-3.0.0-preview-bin-hadoop3.2.tgz
mkdir -p /software/astro/centos7/spark/3.8.0
mv spark-3.0.0-preview-bin-hadoop3.2/* /software/astro/centos7/spark/3.8.0
cd /software/astro/centos7/spark/3.8.0/python
pip install pypandoc
python setup.py sdist
pip install dist/pyspark-3.0.0.dev0.tar.gz
Environment
Install JupyterLab & extensions
source ~/envs/mocks3/bin/activate
pip install jupyterlab jupytext
Start JupyterLab, then open it in a browser to trigger the rebuild:
jupyter-lab --no-browser --ip=$(hostname)
Input preparation
CREATE TABLE tallada.`flagship_v1_1_halos`( `bx` tinyint, `by` tinyint, `bz` tinyint, `id` bigint, `pid` bigint, `num_p` int, `x` float, `y` float, `z` float, `vx` float, `vy` float, `vz` float, hpix_2_nest smallint, hpix_13_nest bigint ) PARTITIONED BY ( z_step smallint ) CLUSTERED BY ( hpix_2_nest ) SORTED BY ( hpix_13_nest ) INTO 192 BUCKETS STORED AS PARQUET ;
INSERT OVERWRITE TABLE tallada.`flagship_v1_1_halos` PARTITION(z_step) SELECT bx, `by`, bz, id, pid, num_p, x, y, z, vx, vy, vz, default.vec2pix(2, x, y, z), default.vec2pix(13, x, y, z), step FROM cosmohub.flagship_rockstar_octant1_c AS m JOIN flagship_steps AS s ON SQRT(m.x*m.x + m.y*m.y + m.z*m.z) BETWEEN s.r_min AND s.r_max WHERE x BETWEEN 0 AND 200 AND y BETWEEN 0 AND 200 AND z>=0 AND pid=-1 ;