from pyspark.sql.functions import col, lit
# File location and type
file_location = "/FileStore/tables/sales_data_part1.csv"
file_type = "csv"
# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","
# The applied options are for CSV files. For other file types, these will
# be ignored.
df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)
display(df)
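# Note: because inferSchema is "false" above, every column is read as a string.
# A minimal sketch (assuming Quantity and UnitPrice exist in this dataset) of
# casting them to numeric types before doing any arithmetic on them:
df_typed = df.withColumn("Quantity", col("Quantity").cast("int")) \
    .withColumn("UnitPrice", col("UnitPrice").cast("double"))
df_typed.printSchema()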
# Adding a new column with a default value.
# Remember to import the lit function from
# pyspark.sql.functions.
# The following code adds a new column named
# Continent with a default value of "North America".
df2 = df.withColumn("Continent", lit("North America"))
df2.display()
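# lit() also accepts non-string defaults. A small sketch (the IsRetail column
# name is only an illustration, not part of the original dataset):
df_flag = df.withColumn("IsRetail", lit(True))
df_flag.display()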
# Adding a new column based on existing column values.
# The following code adds a new column TotalPrice
# by multiplying Quantity and UnitPrice.
df3 = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df3.display()
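# Optional: a sketch that rounds the computed total to two decimal places
# using the built-in round function (imported under an alias so it does not
# shadow Python's round).
from pyspark.sql.functions import round as spark_round
df3_rounded = df.withColumn("TotalPrice", spark_round(col("Quantity") * col("UnitPrice"), 2))
df3_rounded.display()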
# Adding multiple columns.
# The following code adds two columns:
# TotalPrice = Quantity * UnitPrice, and
# Region with a default value of "India".
df4 = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice")).withColumn("Region", lit("India"))
df4.display()
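# On Spark 3.3 and later, the same result can be written as a single
# withColumns call that takes a dict of column names to expressions.
# This is just a sketch; on older versions, chain withColumn as above.
df4_alt = df.withColumns({
    "TotalPrice": col("Quantity") * col("UnitPrice"),
    "Region": lit("India"),
})
df4_alt.display()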
# Adding a column using select.
# The following code creates a new DataFrame with a single
# column named Region, set to "India" for every row.
df5 = df.select(lit("India").alias("Region"))
df5.display()
# To keep all existing columns and add the new one, you can use the following code.
df6 = df.select(col("InvoiceNo"), col("StockCode"), col("Description"),
                col("Quantity"), col("InvoiceDate"), col("UnitPrice"),
                col("CustomerID"), col("Country"),
                lit("India").alias("Region"))
df6.display()
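# A shorter, equivalent sketch: "*" selects every existing column, so you
# don't have to list each one by name.
df7 = df.select("*", lit("India").alias("Region"))
df7.display()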