From 7cc6c3e79a8cd5b96ef5ad2dbf40363ea67def8e Mon Sep 17 00:00:00 2001 From: Tyler Gannon Date: Sun, 19 May 2013 10:51:17 -0700 Subject: [PATCH 1/2] Add ability to output worksheet name in Excel Parser Configuration of excel parser can now include: :worksheet_column => {{column_name}} Resulting output has {{column_name}} field with the current worksheet name in each row. --- lib/etl/parser/excel_parser.rb | 4 +++- spec/fixtures/excel2.ctl | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/etl/parser/excel_parser.rb b/lib/etl/parser/excel_parser.rb index df23469..38dc4fa 100644 --- a/lib/etl/parser/excel_parser.rb +++ b/lib/etl/parser/excel_parser.rb @@ -4,7 +4,7 @@ module ETL class Parser class ExcelParser < ETL::Parser - attr_accessor :ignore_blank_line + attr_accessor :ignore_blank_line, :worksheet_column # Initialize the parser # * source: The Source object @@ -49,6 +49,7 @@ def each f = fields[index] row[f.name] = value end + row[worksheet_column] = raw_row.worksheet.name if worksheet_column yield row end end @@ -87,6 +88,7 @@ def configure end unless source.definition[:worksheets].nil? self.ignore_blank_line = source.definition[:ignore_blank_line] + self.worksheet_column = source.definition[:worksheet_column] source.definition[:fields].each do |options| case options diff --git a/spec/fixtures/excel2.ctl b/spec/fixtures/excel2.ctl index 31523c3..08e9099 100644 --- a/spec/fixtures/excel2.ctl +++ b/spec/fixtures/excel2.ctl @@ -11,7 +11,10 @@ source :in, { :ssn, :age, :sex - ] + ] #, + # Add worksheet column e.g. + # In case the schemas of sheets are the same but their data should be differentiable as such. + # :worksheet_column => :name_info } transform :ssn, :sha1 From eb54eabab910e28e34f1d3277d4412d0dde696ee Mon Sep 17 00:00:00 2001 From: Tyler Gannon Date: Sat, 1 Jun 2013 14:14:05 -0700 Subject: [PATCH 2/2] make row validation an option, parse XLSX files, add worksheet column for excel parser. --- lib/etl/engine.rb | 13 +++++++------ lib/etl/parser/csv_parser.rb | 10 +++++++++- lib/etl/parser/excel_parser.rb | 31 +++++++++++++++++++++++-------- spec/fixtures/data/excel2.xls | Bin 26624 -> 26624 bytes 4 files changed, 39 insertions(+), 15 deletions(-) diff --git a/lib/etl/engine.rb b/lib/etl/engine.rb index ccc7825..444dcf9 100644 --- a/lib/etl/engine.rb +++ b/lib/etl/engine.rb @@ -310,12 +310,13 @@ def process_batch(batch) def process_control(control) control = ETL::Control.resolve(control) say_on_own_line "Processing control #{control.file}" - - ETL::Engine.job = ETL::Execution::Job.create!( - :control_file => control.file, - :status => 'executing', - :batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil - ) + + ETL::Engine.job = ETL::Execution::Job.new.tap do |job| + job.control_file = control.file + job.status = 'executing' + job.batch_id = ETL::Engine.batch ? ETL::Engine.batch.id : nil + job.save! + end execute_dependencies(control) diff --git a/lib/etl/parser/csv_parser.rb b/lib/etl/parser/csv_parser.rb index 4a79bfe..cd015f8 100644 --- a/lib/etl/parser/csv_parser.rb +++ b/lib/etl/parser/csv_parser.rb @@ -9,6 +9,8 @@ def initialize(source, options={}) super configure end + + attr_reader :validate_rows def get_fields_names(file) File.open(file) do |input| @@ -43,7 +45,7 @@ def each end line += 1 row = {} - validate_row(raw_row, line, file) + validate_row(raw_row, line, file) if self.validate_rows raw_row.each_with_index do |value, index| f = fields[index] row[f.name] = value @@ -70,6 +72,12 @@ def validate_row(row, line, file) end def configure + @validate_rows = if source.configuration.has_key?(:validate_rows) + source.configuration[:validate_rows] + else + true + end + source.definition.each do |options| case options when Symbol diff --git a/lib/etl/parser/excel_parser.rb b/lib/etl/parser/excel_parser.rb index 38dc4fa..342c39c 100644 --- a/lib/etl/parser/excel_parser.rb +++ b/lib/etl/parser/excel_parser.rb @@ -1,10 +1,10 @@ -optional_require 'spreadsheet' +optional_require 'roo' module ETL class Parser class ExcelParser < ETL::Parser - attr_accessor :ignore_blank_line, :worksheet_column + attr_accessor :ignore_blank_line, :worksheet_column, :validate_rows # Initialize the parser # * source: The Source object @@ -20,19 +20,29 @@ def each ETL::Engine.logger.debug "parsing #{file}" line = 0 lines_skipped = 0 - book = Spreadsheet.open file + book = Roo::Spreadsheet.open file loopworksheets = [] if worksheets.empty? - loopworksheets = book.worksheets + loopworksheets = book.sheets else worksheets.each do |index| - loopworksheets << book.worksheet(index) + loopworksheets << book.sheet(index) end end + + sheet_index = -1 - loopworksheets.each do |sheet| + book.each_with_pagename do |name, sheet| + sheet_index += 1 + # puts "Sheet: #{name}" + # puts worksheets.inspect + if !worksheets.empty? && !worksheets.include?(sheet_index) + # puts "No!!! #{sheet_index.inspect}" + next + end sheet.each do |raw_row| + # puts raw_row.inspect if lines_skipped < source.skip_lines ETL::Engine.logger.debug "skipping line" lines_skipped += 1 @@ -44,12 +54,12 @@ def each lines_skipped += 1 next end - validate_row(raw_row, line, file) + validate_row(raw_row, line, file) if self.validate_rows raw_row.each_with_index do |value, index| f = fields[index] row[f.name] = value end - row[worksheet_column] = raw_row.worksheet.name if worksheet_column + row[worksheet_column] = name if worksheet_column yield row end end @@ -89,6 +99,11 @@ def configure self.ignore_blank_line = source.definition[:ignore_blank_line] self.worksheet_column = source.definition[:worksheet_column] + self.validate_rows = if source.configuration.has_key?(:validate_rows) + source.configuration[:validate_rows] + else + true + end source.definition[:fields].each do |options| case options diff --git a/spec/fixtures/data/excel2.xls b/spec/fixtures/data/excel2.xls index f87c08721d232d5d09239f10560bdb4596c721f1..d98fe864db7d5f2396475bfe15549c55c6c7d0ad 100644 GIT binary patch delta 48 zcmZp;z}RqsaYGIZ`vk27lbs>wHW#z(VB`s@%twa$QC|1fQ@VO47Q9! E0L2#(u>b%7 delta 48 zcmZp;z}RqsaYGIZ`zKzx|L$#PHW#z(VB`r-&MzrZaL>=HOii5Z$QC|1fQ@VO47Q9! E0NCjgJOBUy