 
 package org.apache.spark.sql.execution.datasources.v2
 
+import scala.collection.JavaConverters._
+
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
-import org.apache.spark.sql.sources.v2.reader._
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.catalyst.plans.QueryPlan
+import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
+import org.apache.spark.sql.execution.datasources.DataSourceStrategy
+import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
+import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema}
+import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownCatalystFilters, SupportsPushDownFilters, SupportsPushDownRequiredColumns, SupportsReportStatistics}
+import org.apache.spark.sql.types.StructType
 
 case class DataSourceV2Relation(
-    output: Seq[AttributeReference],
-    reader: DataSourceReader)
-  extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder {
+    source: DataSourceV2,
+    options: Map[String, String],
+    projection: Seq[AttributeReference],
+    filters: Option[Seq[Expression]] = None,
+    userSpecifiedSchema: Option[StructType] = None) extends LeafNode with MultiInstanceRelation {
+
+  import DataSourceV2Relation._
+
+  override def simpleString: String = {
+    s"DataSourceV2Relation(source=${source.name}, " +
+      s"schema=[${output.map(a => s"$a ${a.dataType.simpleString}").mkString(", ")}], " +
+      s"filters=[${pushedFilters.mkString(", ")}], options=$options)"
+  }
+
+  override lazy val schema: StructType = reader.readSchema()
+
+  override lazy val output: Seq[AttributeReference] = {
+    // use the projection attributes to avoid assigning new ids. fields that are not projected
+    // will be assigned new ids, which is okay because they are not projected.
+    val attrMap = projection.map(a => a.name -> a).toMap
+    schema.map(f => attrMap.getOrElse(f.name,
+      AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()))
+  }
+
+  private lazy val v2Options: DataSourceOptions = makeV2Options(options)
+
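+  // creating the reader applies column pruning and filter push-down exactly once; the reader,
+  // the filters Spark must still evaluate, and the successfully pushed filters are kept together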
+  lazy val (
+      reader: DataSourceReader,
+      unsupportedFilters: Seq[Expression],
+      pushedFilters: Seq[Expression]) = {
+    val newReader = userSpecifiedSchema match {
+      case Some(s) =>
+        source.asReadSupportWithSchema.createReader(s, v2Options)
+      case _ =>
+        source.asReadSupport.createReader(v2Options)
+    }
+
+    DataSourceV2Relation.pushRequiredColumns(newReader, projection.toStructType)
 
-  override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation]
+    val (remainingFilters, pushedFilters) = filters match {
+      case Some(filterSeq) =>
+        DataSourceV2Relation.pushFilters(newReader, filterSeq)
+      case _ =>
+        (Nil, Nil)
+    }
+
+    (newReader, remainingFilters, pushedFilters)
+  }
+
+  override def doCanonicalize(): LogicalPlan = {
+    val c = super.doCanonicalize().asInstanceOf[DataSourceV2Relation]
+
+    // override output with canonicalized output to avoid attempting to configure a reader
+    val canonicalOutput: Seq[AttributeReference] = this.output
+        .map(a => QueryPlan.normalizeExprId(a, projection))
+
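+    // the canonical copy keeps only source, options, and projection; filters and the
+    // user-specified schema fall back to their defaults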
+    new DataSourceV2Relation(c.source, c.options, c.projection) {
+      override lazy val output: Seq[AttributeReference] = canonicalOutput
+    }
+  }
 
   override def computeStats(): Statistics = reader match {
     case r: SupportsReportStatistics =>
@@ -37,22 +100,147 @@ case class DataSourceV2Relation(
   }
 
   override def newInstance(): DataSourceV2Relation = {
-    copy(output = output.map(_.newInstance()))
+    // projection is used to maintain id assignment.
+    // if projection is not set, use output so the copy is not equal to the original
+    copy(projection = projection.map(_.newInstance()))
   }
 }
 
 /**
  * A specialization of DataSourceV2Relation with the streaming bit set to true. Otherwise identical
  * to the non-streaming relation.
  */
-class StreamingDataSourceV2Relation(
+case class StreamingDataSourceV2Relation(
     output: Seq[AttributeReference],
-    reader: DataSourceReader) extends DataSourceV2Relation(output, reader) {
+    reader: DataSourceReader)
+  extends LeafNode with DataSourceReaderHolder with MultiInstanceRelation {
   override def isStreaming: Boolean = true
+
+  override def canEqual(other: Any): Boolean = other.isInstanceOf[StreamingDataSourceV2Relation]
+
+  override def newInstance(): LogicalPlan = copy(output = output.map(_.newInstance()))
+
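+  // same statistics logic as the batch relation above: use the reader's reported size when
+  // available, otherwise fall back to the default size estimate from the session conf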
+  override def computeStats(): Statistics = reader match {
+    case r: SupportsReportStatistics =>
+      Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes))
+    case _ =>
+      Statistics(sizeInBytes = conf.defaultSizeInBytes)
+  }
 }
 
 object DataSourceV2Relation {
-  def apply(reader: DataSourceReader): DataSourceV2Relation = {
-    new DataSourceV2Relation(reader.readSchema().toAttributes, reader)
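+  // convenience conversions for DataSourceV2 instances: expose a readable source name and
+  // fail with an AnalysisException when a required read-support interface is not implemented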
+  private implicit class SourceHelpers(source: DataSourceV2) {
+    def asReadSupport: ReadSupport = {
+      source match {
+        case support: ReadSupport =>
+          support
+        case _: ReadSupportWithSchema =>
+          // this method is only called if there is no user-supplied schema. if there is no
+          // user-supplied schema and ReadSupport was not implemented, throw a helpful exception.
+          throw new AnalysisException(s"Data source requires a user-supplied schema: $name")
+        case _ =>
+          throw new AnalysisException(s"Data source is not readable: $name")
+      }
+    }
+
+    def asReadSupportWithSchema: ReadSupportWithSchema = {
+      source match {
+        case support: ReadSupportWithSchema =>
+          support
+        case _: ReadSupport =>
+          throw new AnalysisException(
+            s"Data source does not support user-supplied schema: $name")
+        case _ =>
+          throw new AnalysisException(s"Data source is not readable: $name")
+      }
+    }
+
+    def name: String = {
+      source match {
+        case registered: DataSourceRegister =>
+          registered.shortName()
+        case _ =>
+          source.getClass.getSimpleName
+      }
+    }
+  }
+
+  private def makeV2Options(options: Map[String, String]): DataSourceOptions = {
+    new DataSourceOptions(options.asJava)
+  }
+
+  private def schema(
+      source: DataSourceV2,
+      v2Options: DataSourceOptions,
+      userSchema: Option[StructType]): StructType = {
+    val reader = userSchema match {
+      // TODO: remove this case because it is confusing for users
+      case Some(s) if !source.isInstanceOf[ReadSupportWithSchema] =>
+        val reader = source.asReadSupport.createReader(v2Options)
+        if (reader.readSchema() != s) {
+          throw new AnalysisException(s"${source.name} does not allow user-specified schemas.")
+        }
+        reader
+      case Some(s) =>
+        source.asReadSupportWithSchema.createReader(s, v2Options)
+      case _ =>
+        source.asReadSupport.createReader(v2Options)
+    }
+    reader.readSchema()
+  }
+
+  def create(
+      source: DataSourceV2,
+      options: Map[String, String],
+      filters: Option[Seq[Expression]] = None,
+      userSpecifiedSchema: Option[StructType] = None): DataSourceV2Relation = {
+    val projection = schema(source, makeV2Options(options), userSpecifiedSchema).toAttributes
+    DataSourceV2Relation(source, options, projection, filters,
+      // if the source does not implement ReadSupportWithSchema, then the userSpecifiedSchema must
+      // be equal to the reader's schema. the schema method enforces this. because the user schema
+      // and the reader's schema are identical, drop the user schema.
+      if (source.isInstanceOf[ReadSupportWithSchema]) userSpecifiedSchema else None)
+  }
+
+  private def pushRequiredColumns(reader: DataSourceReader, struct: StructType): Unit = {
+    reader match {
+      case projectionSupport: SupportsPushDownRequiredColumns =>
+        projectionSupport.pruneColumns(struct)
+      case _ =>
+    }
+  }
+
+  private def pushFilters(
+      reader: DataSourceReader,
+      filters: Seq[Expression]): (Seq[Expression], Seq[Expression]) = {
+    reader match {
+      case catalystFilterSupport: SupportsPushDownCatalystFilters =>
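+        // the reader accepts Catalyst expressions directly; pushCatalystFilters returns the
+        // filters Spark must still evaluate, pushedCatalystFilters those that were pushed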
+        (
+          catalystFilterSupport.pushCatalystFilters(filters.toArray),
+          catalystFilterSupport.pushedCatalystFilters()
+        )
+
+      case filterSupport: SupportsPushDownFilters =>
+        // A map from original Catalyst expressions to corresponding translated data source
+        // filters. If a predicate is not in this map, it means it cannot be pushed down.
+        val translatedMap: Map[Expression, Filter] = filters.flatMap { p =>
+          DataSourceStrategy.translateFilter(p).map(f => p -> f)
+        }.toMap
+
+        // Catalyst predicate expressions that cannot be converted to data source filters.
+        val nonConvertiblePredicates = filters.filterNot(translatedMap.contains)
+
+        // Data source filters that cannot be pushed down. An unhandled filter means
+        // the data source cannot guarantee the rows returned can pass the filter.
+        // As a result we must return it so Spark can plan an extra filter operator.
+        val unhandledFilters = filterSupport.pushFilters(translatedMap.values.toArray).toSet
+        val (unhandledPredicates, pushedPredicates) = translatedMap.partition { case (_, f) =>
+          unhandledFilters.contains(f)
+        }
+
+        (nonConvertiblePredicates ++ unhandledPredicates.keys, pushedPredicates.keys.toSeq)
+
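+      // the reader supports no filter push-down: keep every filter in the Spark plan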
+      case _ => (filters, Nil)
+    }
   }
 }