from pyspark.sql import functions as F

# One-hot encode the "Text" column of a small demo DataFrame: emit one 0/1
# indicator column per distinct category value.
# NOTE(review): assumes `sqlContext` is provided by the runtime (e.g. the
# pyspark shell / legacy SQLContext) — confirm against the execution env.
df = sqlContext.createDataFrame(
    [
        (1, "a"),
        (2, "b"),
        (3, "c"),
    ],
    ["ID", "Text"],
)

# Collect the distinct category values to the driver. Sort them so the
# generated column order is deterministic across runs — distinct() makes
# no ordering guarantee.
categories = sorted(df.select("Text").distinct().rdd.flatMap(lambda x: x).collect())

# One indicator expression per category: 1 where Text matches, else 0,
# with the column named after the category itself.
exprs = [
    F.when(F.col("Text") == category, 1).otherwise(0).alias(category)
    for category in categories
]

df.select("ID", *exprs).show()
Output:
+---+---+---+---+
| ID|  a|  b|  c|
+---+---+---+---+
|  1|  1|  0|  0|
|  2|  0|  1|  0|
|  3|  0|  0|  1|
+---+---+---+---+
source share