Spark - Need help locating and accessing files after upgrade

apache-spark

#1

Hi, where have these files (e.g. /public/retail_db/*) been moved after the upgrade, and how do I access them?

scala> val orders = sc.textFile("/public/retail_db/order")
18/08/07 09:48:14 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 361.3 KB, free 510.8 MB)
18/08/07 09:48:14 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 30.8 KB, free 510.7 MB)
18/08/07 09:48:14 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 172.16.1.109:37654 (size: 30.8 KB, free: 511.1 MB)
18/08/07 09:48:14 INFO SparkContext: Created broadcast 0 from textFile at <console>:27
orders: org.apache.spark.rdd.RDD[String] = /public/retail_db/order MapPartitionsRDD[1] at textFile at <console>:27

scala> orders.first
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://nn01.itversity.com:8020/public/retail_db/order
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1314)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.take(RDD.scala:1309)
at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1349)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.first(RDD.scala:1348)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:30)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:37)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:39)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:41)
at $iwC$$iwC$$iwC.<init>(<console>:43)
at $iwC$$iwC.<init>(<console>:45)
at $iwC.<init>(<console>:47)
at <init>(<console>:49)
at .<init>(<console>:53)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:750)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
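
(Note for anyone hitting the same error: textFile is lazy, so a bad path only fails once an action such as first runs, which is why the textFile line itself appeared to succeed. To see what the upgrade left under the parent directory, run hadoop fs -ls /public/retail_db, or list it from spark-shell with the Hadoop FileSystem API bundled with Spark; a minimal sketch, assuming the parent path from the question:

scala> import org.apache.hadoop.fs.{FileSystem, Path}
scala> val fs = FileSystem.get(sc.hadoopConfiguration) // filesystem backing the active SparkContext
scala> fs.listStatus(new Path("/public/retail_db")).foreach(s => println(s.getPath)) // print each entry under the parent directory

Either route shows the same listing; pick whichever is handy.)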

scala> val orders = sc.textFile("http://gw02.itversity.com:8080/public/retail_db/order")
18/08/07 09:49:52 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 361.4 KB, free 510.4 MB)
18/08/07 09:49:52 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 30.8 KB, free 510.4 MB)
18/08/07 09:49:52 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 172.16.1.109:37654 (size: 30.8 KB, free: 511.1 MB)
18/08/07 09:49:52 INFO SparkContext: Created broadcast 1 from textFile at <console>:27
orders: org.apache.spark.rdd.RDD[String] = http://gw02.itversity.com:8080/public/retail_db/order MapPartitionsRDD[3] at textFile at <console>:27

scala> orders.first
java.io.IOException: No FileSystem for scheme: http
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2787)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2794)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:99)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2830)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2812)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:390)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:258)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:240)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:242)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:240)
at scala.Option.getOrElse(Option.scala:120)
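
(The "No FileSystem for scheme: http" error is expected here: sc.textFile hands the URI to Hadoop's FileSystem layer, and this cluster has no handler registered for the http scheme. The data has to be read from HDFS instead, either with the bare path or with the fully qualified namenode URI that appears in the first stack trace, for example:

scala> val orders = sc.textFile("hdfs://nn01.itversity.com:8020/public/retail_db/orders")

Note the path here uses orders rather than order; the reply below explains why.)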


#2

@unirkr

There is a typo in the path you are reading: the directory is orders, not order. You can confirm it exists with:

hadoop fs -ls /public/retail_db/orders
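
With the corrected path, your original spark-shell commands should work as-is:

scala> val orders = sc.textFile("/public/retail_db/orders")
scala> orders.first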


#3

My bad. Thank you Sunil.

