scala> val merch_cat_dim_RDD = sc.textFile("/user/hduser/MERCH_CAT_DIM.csv").filter(line
| => line.split(",")(0).forall(_.isDigit)).map(line => line.split(","))
15/09/06 11:52:45 INFO MemoryStore: ensureFreeSpace(244456) called with curMem=0, maxMem=1111794647
15/09/06 11:52:45 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 238.7 KB, free 1060.1 MB)
15/09/06 11:52:46 INFO MemoryStore: ensureFreeSpace(20627) called with curMem=244456, maxMem=1111794647
15/09/06 11:52:46 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 20.1 KB, free 1060.0 MB)
15/09/06 11:52:46 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.0.11:57011 (size: 20.1 KB, free: 1060.3 MB)
15/09/06 11:52:46 INFO SparkContext: Created broadcast 0 from textFile at <console>:21
merch_cat_dim_RDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[3] at map at <console>:22
scala> val merch_dim_RDD = sc.textFile("/user/hduser/MERCH_DIM.csv").filter(line =>
| line.split(",")(0).forall(_.isDigit)).map(line => line.split(","))
15/09/06 11:52:46 INFO MemoryStore: ensureFreeSpace(244496) called with curMem=265083, maxMem=1111794647
15/09/06 11:52:46 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 238.8 KB, free 1059.8 MB)
15/09/06 11:52:46 INFO MemoryStore: ensureFreeSpace(20627) called with curMem=509579, maxMem=1111794647
15/09/06 11:52:46 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 20.1 KB, free 1059.8 MB)
15/09/06 11:52:46 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.0.11:57011 (size: 20.1 KB, free: 1060.3 MB)
15/09/06 11:52:46 INFO SparkContext: Created broadcast 1 from textFile at <console>:21
merch_dim_RDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[7] at map at <console>:22
scala> val merch_dim_cat_map_RDD = merch_dim_RDD.map(values => (values(3),values))
merch_dim_cat_map_RDD: org.apache.spark.rdd.RDD[(String, Array[String])] = MapPartitionsRDD[8] at map at <console>:23
scala> val merch_cat_map_RDD = merch_cat_dim_RDD.map( values => (values(0),values))
merch_cat_map_RDD: org.apache.spark.rdd.RDD[(String, Array[String])] = MapPartitionsRDD[9] at map at <console>:23
scala>
scala> val joinedRDD = merch_dim_cat_map_RDD.join(merch_cat_map_RDD)
15/09/06 11:52:47 INFO FileInputFormat: Total input paths to process : 1
15/09/06 11:52:47 INFO FileInputFormat: Total input paths to process : 1
joinedRDD: org.apache.spark.rdd.RDD[(String, (Array[String], Array[String]))] = MapPartitionsRDD[12] at join at <console>:29
scala> val map_cat_names_RDD = joinedRDD.map(k => (k._2._2(1),1))
map_cat_names_RDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[13] at map at <console>:31
scala> map_cat_names_RDD.count()
15/09/06 11:52:49 INFO SparkContext: Starting job: count at <console>:34
15/09/06 11:52:49 INFO DAGScheduler: Registering RDD 8 (map at <console>:23)
15/09/06 11:52:49 INFO DAGScheduler: Registering RDD 9 (map at <console>:23)
15/09/06 11:52:49 INFO DAGScheduler: Got job 0 (count at <console>:34) with 2 output partitions (allowLocal=false)
15/09/06 11:52:49 INFO DAGScheduler: Final stage: ResultStage 2(count at <console>:34)
15/09/06 11:52:49 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 0, ShuffleMapStage 1)
15/09/06 11:52:49 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 0, ShuffleMapStage 1)
15/09/06 11:52:49 INFO DAGScheduler: Submitting ShuffleMapStage 0 (MapPartitionsRDD[8] at map at <console>:23), which has no missing parents
15/09/06 11:52:49 INFO MemoryStore: ensureFreeSpace(3952) called with curMem=530206, maxMem=1111794647
15/09/06 11:52:49 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 3.9 KB, free 1059.8 MB)
15/09/06 11:52:49 INFO MemoryStore: ensureFreeSpace(2188) called with curMem=534158, maxMem=1111794647
15/09/06 11:52:49 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.1 KB, free 1059.8 MB)
15/09/06 11:52:49 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.0.11:57011 (size: 2.1 KB, free: 1060.2 MB)
15/09/06 11:52:49 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:874
15/09/06 11:52:49 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 0 (MapPartitionsRDD[8] at map at <console>:23)
15/09/06 11:52:49 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
15/09/06 11:52:49 INFO DAGScheduler: Submitting ShuffleMapStage 1 (MapPartitionsRDD[9] at map at <console>:23), which has no missing parents
15/09/06 11:52:49 INFO MemoryStore: ensureFreeSpace(3960) called with curMem=536346, maxMem=1111794647
15/09/06 11:52:49 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 3.9 KB, free 1059.8 MB)
15/09/06 11:52:49 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, 192.168.0.11, ANY, 1412 bytes)
15/09/06 11:52:49 INFO MemoryStore: ensureFreeSpace(2194) called with curMem=540306, maxMem=1111794647
15/09/06 11:52:49 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 2.1 KB, free 1059.8 MB)
15/09/06 11:52:49 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, 192.168.0.31, ANY, 1412 bytes)
15/09/06 11:52:49 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on 192.168.0.11:57011 (size: 2.1 KB, free: 1060.2 MB)
15/09/06 11:52:49 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:874
15/09/06 11:52:49 INFO DAGScheduler: Submitting 2 missing tasks from ShuffleMapStage 1 (MapPartitionsRDD[9] at map at <console>:23)
15/09/06 11:52:49 INFO TaskSchedulerImpl: Adding task set 1.0 with 2 tasks
15/09/06 11:52:49 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 2, 192.168.0.11, ANY, 1416 bytes)
15/09/06 11:52:49 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 3, 192.168.0.31, ANY, 1416 bytes)
15/09/06 11:52:50 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on 192.168.0.11:55594 (size: 2.1 KB, free: 1589.8 MB)
15/09/06 11:52:50 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.0.11:55594 (size: 2.1 KB, free: 1589.8 MB)
15/09/06 11:52:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.0.11:55594 (size: 20.1 KB, free: 1589.7 MB)
15/09/06 11:52:50 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.0.31:49446 (size: 2.1 KB, free: 1589.8 MB)
15/09/06 11:52:50 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.0.11:55594 (size: 20.1 KB, free: 1589.7 MB)
15/09/06 11:52:51 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on 192.168.0.31:49446 (size: 2.1 KB, free: 1589.8 MB)
15/09/06 11:52:51 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.0.31:49446 (size: 20.1 KB, free: 1589.7 MB)
15/09/06 11:52:51 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 2) in 1595 ms on 192.168.0.11 (1/2)
15/09/06 11:52:51 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 1655 ms on 192.168.0.11 (1/2)
15/09/06 11:52:51 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.0.31:49446 (size: 20.1 KB, free: 1589.7 MB)
15/09/06 11:52:54 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 3) in 5098 ms on 192.168.0.31 (2/2)
15/09/06 11:52:54 INFO DAGScheduler: ShuffleMapStage 1 (map at <console>:23) finished in 5,101 s
15/09/06 11:52:54 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
15/09/06 11:52:54 INFO DAGScheduler: looking for newly runnable stages
15/09/06 11:52:54 INFO DAGScheduler: running: Set(ShuffleMapStage 0)
15/09/06 11:52:54 INFO DAGScheduler: waiting: Set(ResultStage 2)
15/09/06 11:52:54 INFO DAGScheduler: failed: Set()
15/09/06 11:52:54 INFO DAGScheduler: Missing parents for ResultStage 2: List(ShuffleMapStage 0)
15/09/06 11:52:55 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 5334 ms on 192.168.0.31 (2/2)
15/09/06 11:52:55 INFO DAGScheduler: ShuffleMapStage 0 (map at <console>:23) finished in 5,347 s
15/09/06 11:52:55 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/09/06 11:52:55 INFO DAGScheduler: looking for newly runnable stages
15/09/06 11:52:55 INFO DAGScheduler: running: Set()
15/09/06 11:52:55 INFO DAGScheduler: waiting: Set(ResultStage 2)
15/09/06 11:52:55 INFO DAGScheduler: failed: Set()
15/09/06 11:52:55 INFO DAGScheduler: Missing parents for ResultStage 2: List()
15/09/06 11:52:55 INFO DAGScheduler: Submitting ResultStage 2 (MapPartitionsRDD[13] at map at <console>:31), which is now runnable
15/09/06 11:52:55 INFO MemoryStore: ensureFreeSpace(2856) called with curMem=542500, maxMem=1111794647
15/09/06 11:52:55 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 2.8 KB, free 1059.8 MB)
15/09/06 11:52:55 INFO MemoryStore: ensureFreeSpace(1551) called with curMem=545356, maxMem=1111794647
15/09/06 11:52:55 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 1551.0 B, free 1059.8 MB)
15/09/06 11:52:55 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.0.11:57011 (size: 1551.0 B, free: 1060.2 MB)
15/09/06 11:52:55 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:874
15/09/06 11:52:55 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 2 (MapPartitionsRDD[13] at map at <console>:31)
15/09/06 11:52:55 INFO TaskSchedulerImpl: Adding task set 2.0 with 2 tasks
15/09/06 11:52:55 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 4, 192.168.0.31, PROCESS_LOCAL, 1238 bytes)
15/09/06 11:52:55 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 5, 192.168.0.32, PROCESS_LOCAL, 1238 bytes)
15/09/06 11:52:55 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.0.31:49446 (size: 1551.0 B, free: 1589.7 MB)
15/09/06 11:52:55 INFO MapOutputTrackerMasterEndpoint: Asked to send map output locations for shuffle 0 to 192.168.0.31:46073
15/09/06 11:52:55 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 163 bytes
15/09/06 11:52:55 INFO MapOutputTrackerMasterEndpoint: Asked to send map output locations for shuffle 1 to 192.168.0.31:46073
15/09/06 11:52:55 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 1 is 163 bytes
15/09/06 11:52:55 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.0.32:58863 (size: 1551.0 B, free: 1589.8 MB)
15/09/06 11:52:55 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 4) in 672 ms on 192.168.0.31 (1/2)
15/09/06 11:52:56 INFO MapOutputTrackerMasterEndpoint: Asked to send map output locations for shuffle 0 to 192.168.0.32:43956
15/09/06 11:52:56 INFO MapOutputTrackerMasterEndpoint: Asked to send map output locations for shuffle 1 to 192.168.0.32:43956
15/09/06 11:52:57 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 5) in 2735 ms on 192.168.0.32 (2/2)
15/09/06 11:52:57 INFO DAGScheduler: ResultStage 2 (count at <console>:34) finished in 2,739 s
15/09/06 11:52:57 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
15/09/06 11:52:57 INFO DAGScheduler: Job 0 finished: count at <console>:34, took 8,186017 s
res0: Long = 2000
scala> |