3
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.cassandra.CassandraSQLContext
// NOTE(review): this is the code that reproduces the "task not serializable" error.
object Test {
  // "<Cassandra Server IP>" is a placeholder, not valid Scala — substitute a string literal.
  val sparkConf = new SparkConf(true).set("spark.cassandra.connection.host", <Cassandra Server IP>)
  val sc = new SparkContext(sparkConf) // SparkContext is not serializable
  val cassandraSQLContext = new CassandraSQLContext(sc)
  // Evaluated on the driver before the map, but stored as a member of `Test`.
  val numberAsString = cassandraSQLContext.sql("select * from testing.test").first().getAs[Int]("number").toString()
  val testRDD = sc.parallelize(List(0, 0))
  // The closure { x => numberAsString } references a member of the enclosing object,
  // so Spark tries to serialize all of `Test` — including `sc` — and fails.
  val newRDD = testRDD.map { x => numberAsString }
}

This is the code that I have written in Spark. I expected it to work, because I evaluate the numberAsString value on the driver before using it in the map function, but it gives me a "task not serializable" error. I am running the job in local mode.

Error on spark-shell

Error on spark-shell

Dawny33
  • 8,476
  • 12
  • 49
  • 106
Credosam
  • 81
  • 1
  • 10

3 Answers

3

Looking at another question on Stack Overflow about serialization exceptions in Spark, it says that anonymous functions serialize their containing class, and if that class contains the SparkContext -- which is not serializable -- then an error is thrown.

Perhaps this is happening to you?

PriceHardman
  • 216
  • 1
  • 4
3

I changed my code to this:

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.cassandra.CassandraSQLContext
import org.apache.spark.rdd.RDD
object Test2 {
  /** Returns an RDD in which every element of `testRDD` is replaced by `numberAsString`.
    *
    * Because `numberAsString` is a method parameter (not a member of an enclosing
    * object), the map closure captures only this serializable String.
    *
    * @param numberAsString the constant value to emit for each element
    * @param testRDD        the input RDD whose size drives the output
    * @return an RDD of `numberAsString` repeated once per input element
    */
  def calculate(numberAsString: String, testRDD: RDD[Int]): RDD[String] =
    testRDD.map(_ => numberAsString)
}

// Driver-side setup; the RDD transformation is delegated to Test2.calculate,
// whose closure no longer captures this object (and its SparkContext).
object Test {
  // "<Cassandra Server IP>" is a placeholder, not valid Scala — substitute a string literal.
  val sparkConf = new SparkConf(true).set("spark.cassandra.connection.host", <Cassandra Server IP>)
  val sc = new SparkContext(sparkConf)
  val cassandraSQLContext = new CassandraSQLContext(sc)
  // NOTE(review): keyspace here is "hdfc.test" while the question used "testing.test" — verify.
  val numberAsString = cassandraSQLContext.sql("select * from hdfc.test").first().getAs[Int]("number").toString()
  val testRDD = sc.parallelize(List(0, 0))
  // The value is passed by parameter, so only a String crosses the closure boundary.
  val newRDD = Test2.calculate(numberAsString, testRDD)
}

Now, when Spark serializes the map closure, it only needs to capture the method parameter (a plain String) rather than the enclosing object that holds the non-serializable SparkContext, so serialization succeeds.

Credosam
  • 81
  • 1
  • 10
0

You should broadcast your variable.

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.cassandra.CassandraSQLContext
// Alternative fix: ship the value to executors as a broadcast variable instead of
// letting the closure capture a member of this (non-serializable) enclosing object.
object Test {
  // "<Cassandra Server IP>" is a placeholder, not valid Scala — substitute a string literal.
  val sparkConf = new SparkConf(true).set("spark.cassandra.connection.host", <Cassandra Server IP>)
  val sc = new SparkContext(sparkConf)
  val cassandraSQLContext = new CassandraSQLContext(sc)
  val numberAsString = cassandraSQLContext.sql("select * from testing.test").first().getAs[Int]("number").toString()
  // Broadcast wraps the String so executors read it via .value without
  // serializing the enclosing object.
  val numberAsStringBC = sc.broadcast(numberAsString)
  val testRDD = sc.parallelize(List(0, 0))
  val newRDD = testRDD.map { x => numberAsStringBC.value }
}