Skip to content

Instantly share code, notes, and snippets.

@randerzander
Last active April 15, 2021 18:45
Show Gist options
  • Save randerzander/af3d55b1ce14e1bf652d78c6a50957ea to your computer and use it in GitHub Desktop.
Save randerzander/af3d55b1ce14e1bf652d78c6a50957ea to your computer and use it in GitHub Desktop.
# Smoke test for an existing `spark` session: build a tiny DataFrame,
# round-trip it through CSV, then run an aggregate through Spark SQL.

# Sample rows; keep the value types consistent within each column.
rows = [
    (1, 'foo'),
    (2, 'bar'),
]
# Column labels, in the same order as the tuple fields above.
columns = ['id', 'txt']
df = spark.createDataFrame(rows, schema=columns)

# Write the frame out as CSV under the path 'test' and read it straight back.
df.write.csv('test')
spark.read.csv('test').collect()

# Expose the frame to SQL under the view name "test" and sum the id column.
df.createOrReplaceTempView("test")
spark.sql("select sum(id) from test").collect()
# Create a local SparkSession with the RAPIDS SQL plugin on the classpath.
from pyspark.sql import SparkSession

# Directory holding the downloaded jars (note the trailing slash).
spark_rapids_dir = '/spark-rapids/'
spark_cudf_jar = f'{spark_rapids_dir}cudf-0.18.1-cuda11.jar'
spark_rapids_jar = f'{spark_rapids_dir}rapids-4-spark_2.12-0.4.1.jar'

# Both jars go on the executor and the driver classpath, ':'-separated.
_rapids_classpath = ':'.join([spark_cudf_jar, spark_rapids_jar])

# Session settings, applied to the builder one by one below.
_session_settings = {
    'spark.executor.extraClassPath': _rapids_classpath,
    'spark.driver.extraClassPath': _rapids_classpath,
    'spark.rapids.sql.concurrentGpuTasks': '1',
    'spark.executor.memory': '4G',
    'spark.executor.cores': '4',
    'spark.task.cpus': '1',
    'spark.rapids.memory.pinnedPool.size': '2G',
    'spark.locality.wait': '0s',
    'spark.sql.files.maxPartitionBytes': '512m',
    'spark.sql.shuffle.partitions': '10',
    'spark.plugins': 'com.nvidia.spark.SQLPlugin',
    'spark.rapids.sql.variableFloatAgg.enabled': 'True',
}
# Don't use these with the local master:
#   'spark.worker.resource.gpu.discoveryScript': f'{spark_rapids_dir}/getGpusResources.sh',
#   'spark.executor.resource.gpu.amount': '1',
#   'spark.task.resource.gpu.amount': '0.25',

_builder = SparkSession.builder.master('local').appName('spark-rapids')
for _key, _value in _session_settings.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()
# Download the RAPIDS Accelerator jar, the cuDF jar and Spark's GPU discovery
# script into /spark-rapids, then install Java 8 and PySpark with conda.

# -p: do not fail if the directory is already there from an earlier run.
mkdir -p /spark-rapids
# Use the absolute path: a relative `cd spark-rapids` only works from `/`,
# and otherwise the downloads below would land in the wrong directory.
cd /spark-rapids
#wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.4.0/rapids-4-spark_2.12-0.4.0.jar
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.4.1/rapids-4-spark_2.12-0.4.1.jar
wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.18.1/cudf-0.18.1-cuda11.jar
wget https://github.com/apache/spark/raw/master/examples/src/main/scripts/getGpusResources.sh
conda install -c conda-forge openjdk=8.0 pyspark=3.1.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment