Skip to content

Instantly share code, notes, and snippets.

@jitsejan
Last active April 9, 2024 11:15
Show Gist options
  • Star 9 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save jitsejan/f3991e5be9495e17aedc16b6512bd209 to your computer and use it in GitHub Desktop.
Save jitsejan/f3991e5be9495e17aedc16b6512bd209 to your computer and use it in GitHub Desktop.
PySpark, Docker and S3
from pyspark import SparkContext, SparkConf, SQLContext

# Build a Spark configuration for reading from S3 through the s3a filesystem.
# NOTE(review): `profile_info` must already be loaded from the AWS credentials
# file (see the ConfigParser snippet in this gist) before this block runs.
conf = (
    SparkConf()
    # Use path-style S3 URLs (bucket in the path, not in the hostname).
    # Spark/Hadoop configuration values are strings, so pass "true" rather
    # than the Python boolean True.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.access.key", profile_info.get('aws_access_key_id'))
    .set("spark.hadoop.fs.s3a.secret.key", profile_info.get('aws_secret_access_key'))
    .set("spark.hadoop.fs.s3a.endpoint", f"s3-{profile_info.get('region')}.amazonaws.com")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Signature Version 4 signing is controlled by a JVM system property;
    # the extraJavaOptions line below is what actually enables it — this
    # conf entry is kept for compatibility with the original setup.
    .set("com.amazonaws.services.s3.enableV4", "true")
    .set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
)
# Read the "prod" profile out of the AWS credentials file mounted into the
# container, exposing it as `profile_info` for the SparkConf snippet above.
from configparser import ConfigParser

_credentials = ConfigParser()
_credentials.read("/home/jovyan/.aws/credentials")
profile_info = _credentials["prod"]
[prod]
aws_access_key_id = xxxxxxyyyyyyy
aws_secret_access_key = zzzzzzzzyyyyyyy
region = eu-west-2
version: '3'
services:
  jitsejan-pyspark:
    # Run as root with sudo granted so packages can be installed from inside
    # the notebook; `privileged` is required by the original setup.
    user: root
    privileged: true
    image: jitsejan/pyspark-notebook
    restart: always
    volumes:
      - ./notebooks:/opt/notebooks
      - ./data:/opt/data
      # Mount the host's AWS credentials read-only for the Jupyter user.
      - $HOME/.aws/credentials:/home/jovyan/.aws/credentials:ro
    environment:
      - GRANT_SUDO=yes
    ports:
      - "8488:8488"
FROM jupyter/pyspark-notebook

USER root

# Essential packages and locale generation in a single layer; the original
# ran `apt-get update` twice in separate layers and never cleaned the apt
# cache, which bloats the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential curl git gnupg2 nano \
        apt-transport-https software-properties-common locales \
 && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
 && locale-gen \
 && rm -rf /var/lib/apt/lists/*

# Add config to Jupyter notebook
COPY jupyter/jupyter_notebook_config.py /home/jovyan/.jupyter/
# NOTE(review): 777 is overly permissive; `chown -R jovyan` would be safer.
RUN chmod -R 777 /home/jovyan/

# AWS + Hadoop S3A jars so Spark can read s3a:// paths
RUN wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -P $SPARK_HOME/jars/ \
 && wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar -P $SPARK_HOME/jars/

USER $NB_USER

# Install Python requirements
COPY requirements.txt /home/jovyan/
RUN pip install -r /home/jovyan/requirements.txt

# Pre-download the popular NLTK corpora so notebooks work offline
RUN python -c "import nltk; nltk.download('popular')"

# Custom styling
RUN mkdir -p /home/jovyan/.jupyter/custom
COPY custom/custom.css /home/jovyan/.jupyter/custom/

# NB extensions
RUN jupyter contrib nbextension install --user \
 && jupyter nbextensions_configurator enable --user

# Run the notebook
CMD ["/opt/conda/bin/jupyter", "lab", "--allow-root"]
# Jupyter notebook server configuration. Jupyter executes this file at
# startup and injects get_config() into its namespace.
c = get_config()
# Display every top-level expression in a cell, not just the last one.
c.InteractiveShell.ast_node_interactivity = "all"
# Accept requests from any origin (needed when the server runs in Docker).
c.NotebookApp.allow_origin = '*'
# Listen on all interfaces so the container's port mapping reaches it.
c.NotebookApp.ip = '*'
c.NotebookApp.notebook_dir = '/opt/notebooks/'
c.NotebookApp.open_browser = False
# Hashed password placeholder — generate a real one with notebook.auth.passwd().
c.NotebookApp.password = u'sha1:a123:345345'
c.NotebookApp.port = 8488
# Reuse an already-running SparkContext if one exists. The original
# `SparkContext(conf=conf).getOrCreate()` constructed a context and then
# called the class-level getOrCreate on the instance — which raises if a
# context is already active; the classmethod form is the correct idiom.
sc = SparkContext.getOrCreate(conf=conf)
# NOTE(review): SQLContext is deprecated in modern Spark; SparkSession is
# the current entry point, but it is kept here to preserve the interface.
sqlContext = SQLContext(sc)
df = sqlContext.read.parquet("s3a://datalake/warehouse/platform/company_list/")
@jitsejan
Copy link
Author

jitsejan commented Apr 8, 2020

Hi there, What is the password? u'sha1:a123:345345'

Hi @jamesnos, you should use the following to get the password you want.

from notebook.auth import passwd
passwd()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment