@@ -17,20 +17,21 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
-import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions}
+import org.apache.spark.sql.execution.datasources.json.JSONRelation
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{StringType, StructType}
 import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _}
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.{SerializableConfiguration, Utils}
+import org.apache.spark.{Logging, TaskContext}
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
@@ -61,6 +62,20 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { |
     // Scanning partitioned HadoopFsRelation
     case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation))
         if t.partitionSpec.partitionColumns.nonEmpty =>
+      // One characteristic of JSONRelation is that, after data in the input folder is updated,
+      // users don't need to refresh the relation manually to read the most recent data. This
+      // feature is inherited from the old version of JSONRelation (the one predating the
+      // migration to HadoopFsRelation). Normal HadoopFsRelations, however, don't share this
+      // characteristic, so we special-case JSONRelation here and refresh it manually.
+      //
+      // Note that the refresh can't be done in JSONRelation.buildScan, because buildScan is
+      // invoked once for each individual partition.
+      //
+      // Please refer to SPARK-10289 and SPARK-9743 for more details.
+      if (t.isInstanceOf[JSONRelation]) {
+        t.refresh()
+      }
+
       val selectedPartitions = prunePartitions(filters, t.partitionSpec).toArray
 
       logInfo {
@@ -88,6 +103,20 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { |
 
     // Scanning non-partitioned HadoopFsRelation
     case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation)) =>
+      // One characteristic of JSONRelation is that, after data in the input folder is updated,
+      // users don't need to refresh the relation manually to read the most recent data. This
+      // feature is inherited from the old version of JSONRelation (the one predating the
+      // migration to HadoopFsRelation). Normal HadoopFsRelations, however, don't share this
+      // characteristic, so we special-case JSONRelation here and refresh it manually.
+      //
+      // Note that the refresh can't be done in JSONRelation.buildScan, because buildScan is
+      // invoked once for each individual partition.
+      //
+      // Please refer to SPARK-10289 and SPARK-9743 for more details.
+      if (t.isInstanceOf[JSONRelation]) {
+        t.refresh()
+      }
+
       // See buildPartitionedTableScan for the reason that we need to create a shared
       // broadcast HadoopConf.
       val sharedHadoopConf = SparkHadoopUtil.get.conf
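
For readers following the linked JIRAs, here is a minimal sketch of the behavior this patch restores, assuming a Spark 1.5-style `SQLContext`; the `/tmp/events` path and the `demo` wrapper are hypothetical and for illustration only:

```scala
import org.apache.spark.sql.SQLContext

// Sketch of the JSONRelation auto-refresh behavior described above (SPARK-10289).
// The input path and the surrounding method are assumptions, not part of the patch.
def demo(sqlContext: SQLContext): Unit = {
  // Build a DataFrame backed by a JSONRelation over a folder of JSON files.
  val events = sqlContext.read.json("/tmp/events")
  println(events.count()) // prints the current number of records, say N

  // ... another process writes additional JSON files into /tmp/events ...

  // With this patch, DataSourceStrategy calls refresh() on the JSONRelation
  // while planning the scan, so re-running the query sees the new files.
  println(events.count()) // now prints a count greater than N
}
```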