Skip to content

Instantly share code, notes, and snippets.

@diogobaltazar
Last active December 16, 2019 14:46
Show Gist options
  • Save diogobaltazar/ee92bf13388c4467331365e16af89dcf to your computer and use it in GitHub Desktop.
Save diogobaltazar/ee92bf13388c4467331365e16af89dcf to your computer and use it in GitHub Desktop.
Transform columns with condition on rows
> df = spark.createDataFrame(
[(1, 0), (3, 0)],
("a", "b")
)
> transf_dataset(df, F.col('a') + F.col('a'), 'a').show()
+---+---+
| a| b|
+---+---+
| 2| 0|
| 6| 0|
+---+---+
def transf_dataset(dataset, transform, col_name, filtering_condition=False):
    """Overwrite column ``col_name`` with ``transform`` on the selected rows.

    Rows for which ``filtering_condition`` is true get ``col_name`` replaced
    by the value of ``transform``. Every other row, including rows for which
    the condition evaluates to NULL, is returned unchanged. The column order
    of ``dataset`` is preserved.

    Args:
        dataset: Spark DataFrame to transform.
        transform: Column expression giving the new value of ``col_name``.
            It is evaluated against the original columns, so it may refer
            to ``col_name`` itself.
        col_name: Name of the column to overwrite.
        filtering_condition: Boolean Column selecting the rows to transform.
            When left at its default (``False``) or passed as ``None``, no
            condition is applied and every row is transformed.

    Returns:
        A DataFrame with the same columns as ``dataset``. When a condition
        is given the result is the union of the untouched rows followed by
        the transformed rows, so the input row order is not kept.
    """
    columns = dataset.columns

    # No condition supplied: a plain bool is not a valid filter() argument
    # (and ``~False`` is just the int -1), so transform the whole dataset.
    if filtering_condition is None or filtering_condition is False:
        return dataset.withColumn(col_name, transform).select(columns)

    matching = dataset.filter(filtering_condition)
    # filter() keeps only rows where the predicate is true, and the negation
    # of NULL is still NULL, so NULL-condition rows must be kept explicitly
    # or they would vanish from both halves.
    untouched = dataset.filter(
        filtering_condition.isNull() | ~filtering_condition
    )

    # withColumn() overwrites the existing column directly, so no temporary
    # "<col_name>_new" column (which could clash with a real one) is needed.
    # The select() pins the schema to the input's so the union lines up.
    transformed = matching.withColumn(col_name, transform).select(columns)

    return untouched.union(transformed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment