"""Unit-testing a PySpark DataFrame transformation.

Reassembled from shuffled, truncated fragments: together they spell out a
complete unittest suite for ``add_doubled_column``.
"""
import unittest

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


def add_doubled_column(df, column_name):
    """Return *df* with an extra "doubled" column holding ``column_name`` * 2."""
    return df.withColumn("doubled", col(column_name) * 2)


class PySparkTest(unittest.TestCase):
    """Tests for add_doubled_column against a shared local SparkSession."""

    @classmethod
    def setUpClass(cls):
        # One session for the whole class: starting Spark is slow, so a
        # per-test setUp would dominate the run. "local[2]" = in-process
        # driver with two worker threads.
        cls.spark = (
            SparkSession.builder
            .appName("PySparkTesting")
            .master("local[2]")
            .getOrCreate()
        )

    @classmethod
    def tearDownClass(cls):
        # Release the JVM/driver resources held by the session.
        cls.spark.stop()

    def test_add_doubled_column(self):
        # Sample input with a single "value" column.
        df = self.spark.createDataFrame([(1,), (2,), (3,)], ["value"])
        result_df = add_doubled_column(df, "value")
        expected_df = self.spark.createDataFrame(
            [(1, 2), (2, 4), (3, 6)], ["value", "doubled"]
        )
        # assertEqual (not assertTrue on ==) reports the differing rows
        # when the comparison fails.
        self.assertEqual(
            result_df.collect(),
            expected_df.collect(),
            "The transformed DataFrame does not match the expected DataFrame.",
        )


if __name__ == "__main__":
    unittest.main()
Scenario 1: Unit Testing a Data Transformation Function in PySpark

Problem: You have developed a function that transforms a DataFrame by adding a new column which doubles the values of an existing column. You need to write a unit test to validate that this function performs as expected.

Solution:
import unittest

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


def add_doubled_column(df, column_name):
    """Return *df* with a new "doubled" column equal to ``column_name`` * 2.

    Args:
        df: Input ``pyspark.sql.DataFrame``.
        column_name: Name of the (numeric) column whose values are doubled.

    Returns:
        A new DataFrame with all original columns plus "doubled"; the
        input DataFrame is not modified (withColumn is non-mutating).
    """
    return df.withColumn("doubled", col(column_name) * 2)
class PySparkTest(unittest.TestCase):
    """Unit tests for ``add_doubled_column`` using a shared local SparkSession."""

    @classmethod
    def setUpClass(cls):
        # Build one session for the entire test class; Spark startup is
        # expensive, so per-test setUp would dominate the run time.
        # "local[2]" runs the driver in-process with two worker threads.
        cls.spark = (
            SparkSession.builder
            .appName("PySparkTesting")
            .master("local[2]")
            .getOrCreate()
        )

    @classmethod
    def tearDownClass(cls):
        # Release the JVM/driver resources held by the session.
        cls.spark.stop()

    def test_add_doubled_column(self):
        # Create a sample DataFrame with a single "value" column.
        df = self.spark.createDataFrame([(1,), (2,), (3,)], ["value"])
        # Apply the transformation under test.
        result_df = add_doubled_column(df, "value")
        # Expected DataFrame: each original value paired with its double.
        expected_data = [(1, 2), (2, 4), (3, 6)]
        expected_df = self.spark.createDataFrame(expected_data, ["value", "doubled"])
        # assertEqual (rather than assertTrue on ==) prints the differing
        # rows when the test fails, in addition to the custom message.
        self.assertEqual(
            result_df.collect(),
            expected_df.collect(),
            "The transformed DataFrame does not match the expected DataFrame.",
        )
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
---

Fill-in-the-blanks exercise (reconstruct the solution above):

____________________
____________________
____________________

def ____________________):
    ____________________)

class ____________________):
    ____________________
    def ____________________):
        ____________________ \
            .____________________") \
            .____________________]") \
            .____________________()

    ____________________
    def ____________________):
        ____________________()

    def ____________________):
        # Create a sample DataFrame
        ____________________([(____________________,)], ["____________________"])
        # Apply the transformation
        ____________________")
        # Expected DataFrame
        ____________________)]
        ____________________(____________________, ["____________________"])
        # Assert that the transformed DataFrame matches the expected DataFrame
        ____________________(____________________(), "The transformed DataFrame does not match the expected DataFrame.")

# Run the tests
____________________":
    ____________________()
Explanation:

Function definition: add_doubled_column takes a DataFrame and a column name, then returns the DataFrame with an additional column where the values are doubled.

Unit test setup: The unittest framework structures the PySpark tests, with setUpClass and tearDownClass initializing and stopping the Spark session once for the whole class.

Test method: test_add_doubled_column creates a sample DataFrame, applies the transformation, and compares the result with an expected DataFrame to ensure the function works correctly.

Assertions: The test asserts that the collected data from the result DataFrame matches the expected DataFrame, providing a clear failure message if the test fails.

This unit-testing approach is a fundamental part of developing reliable PySpark applications, allowing you to validate transformations and logic independently of the complete ETL pipeline.