1 |
|
#!/usr/local/bin/ruby -w
|
2 |
|
|
3 |
|
# = faster_csv.rb -- Faster CSV Reading and Writing
|
4 |
|
#
|
5 |
|
# Created by James Edward Gray II on 2005-10-31.
|
6 |
|
# Copyright 2005 Gray Productions. All rights reserved.
|
7 |
|
#
|
8 |
|
# See FasterCSV for documentation.
|
9 |
|
|
10 |
|
# FasterCSV is only meant for Ruby releases before 1.9.  On anything newer,
# stop loading right away and point the user at the standard CSV library.
unless RUBY_VERSION < "1.9"
  version_warning = <<-VERSION_WARNING.gsub(/^\s+/, "")
  Please switch to Ruby 1.9's standard CSV library. It's FasterCSV plus
  support for Ruby 1.9's m17n encoding engine.
  VERSION_WARNING

  abort version_warning
end
|
16 |
|
|
17 |
|
require "forwardable"
|
18 |
|
require "English"
|
19 |
|
require "enumerator"
|
20 |
|
require "date"
|
21 |
|
require "stringio"
|
22 |
|
|
23 |
|
#
|
24 |
|
# This class provides a complete interface to CSV files and data. It offers
|
25 |
|
# tools to enable you to read and write to and from Strings or IO objects, as
|
26 |
|
# needed.
|
27 |
|
#
|
28 |
|
# == Reading
|
29 |
|
#
|
30 |
|
# === From a File
|
31 |
|
#
|
32 |
|
# ==== A Line at a Time
|
33 |
|
#
|
34 |
|
# FasterCSV.foreach("path/to/file.csv") do |row|
|
35 |
|
# # use row here...
|
36 |
|
# end
|
37 |
|
#
|
38 |
|
# ==== All at Once
|
39 |
|
#
|
40 |
|
# arr_of_arrs = FasterCSV.read("path/to/file.csv")
|
41 |
|
#
|
42 |
|
# === From a String
|
43 |
|
#
|
44 |
|
# ==== A Line at a Time
|
45 |
|
#
|
46 |
|
# FasterCSV.parse("CSV,data,String") do |row|
|
47 |
|
# # use row here...
|
48 |
|
# end
|
49 |
|
#
|
50 |
|
# ==== All at Once
|
51 |
|
#
|
52 |
|
# arr_of_arrs = FasterCSV.parse("CSV,data,String")
|
53 |
|
#
|
54 |
|
# == Writing
|
55 |
|
#
|
56 |
|
# === To a File
|
57 |
|
#
|
58 |
|
# FasterCSV.open("path/to/file.csv", "w") do |csv|
|
59 |
|
# csv << ["row", "of", "CSV", "data"]
|
60 |
|
# csv << ["another", "row"]
|
61 |
|
# # ...
|
62 |
|
# end
|
63 |
|
#
|
64 |
|
# === To a String
|
65 |
|
#
|
66 |
|
# csv_string = FasterCSV.generate do |csv|
|
67 |
|
# csv << ["row", "of", "CSV", "data"]
|
68 |
|
# csv << ["another", "row"]
|
69 |
|
# # ...
|
70 |
|
# end
|
71 |
|
#
|
72 |
|
# == Convert a Single Line
|
73 |
|
#
|
74 |
|
# csv_string = ["CSV", "data"].to_csv # to CSV
|
75 |
|
# csv_array = "CSV,String".parse_csv # from CSV
|
76 |
|
#
|
77 |
|
# == Shortcut Interface
|
78 |
|
#
|
79 |
|
# FCSV { |csv_out| csv_out << %w{my data here} } # to $stdout
|
80 |
|
# FCSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String
|
81 |
|
# FCSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr
|
82 |
|
#
|
83 |
|
class FasterCSV
|
84 |
|
# The version of the installed library.
|
85 |
|
VERSION = "1.5.0".freeze
|
86 |
|
|
87 |
|
#
|
88 |
|
# A FasterCSV::Row is part Array and part Hash. It retains an order for the
|
89 |
|
# fields and allows duplicates just as an Array would, but also allows you to
|
90 |
|
# access fields by name just as you could if they were in a Hash.
|
91 |
|
#
|
92 |
|
# All rows returned by FasterCSV will be constructed from this class, if
|
93 |
|
# header row processing is activated.
|
94 |
|
#
|
95 |
|
class Row
  #
  # Construct a new FasterCSV::Row from +headers+ and +fields+, which are
  # expected to be Arrays.  If one Array is shorter than the other, it will be
  # padded with +nil+ objects.
  #
  # The optional +header_row+ parameter can be set to +true+ to indicate, via
  # FasterCSV::Row.header_row?() and FasterCSV::Row.field_row?(), that this is
  # a header row.  Otherwise, the row is assumed to be a field row.
  #
  # A FasterCSV::Row object supports the following Array methods through
  # delegation:
  #
  # * empty?()
  # * length()
  # * size()
  #
  def initialize(headers, fields, header_row = false)
    @header_row = header_row

    # zip() pads the shorter Array with nil, so always zip from the longer
    # side and flip the pairs back to [header, field] order when needed
    @row = if headers.size > fields.size
      headers.zip(fields)
    else
      fields.zip(headers).map { |pair| pair.reverse }
    end
  end

  # Internal data format (an Array of [header, field] pairs) used to compare
  # equality.
  attr_reader :row
  protected   :row

  ### Array Delegation ###

  extend Forwardable
  def_delegators :@row, :empty?, :length, :size

  # Returns +true+ if this is a header row.
  def header_row?
    @header_row
  end

  # Returns +true+ if this is a field row.
  def field_row?
    not header_row?
  end

  # Returns the headers of this row.
  def headers
    @row.map { |pair| pair.first }
  end

  #
  # :call-seq:
  #   field( header )
  #   field( header, offset )
  #   field( index )
  #
  # This method will fetch the field value by +header+ or +index+.  If a field
  # is not found, +nil+ is returned.
  #
  # When provided, +offset+ ensures that a header match occurs on or later
  # than the +offset+ index.  You can use this to find duplicate headers,
  # without resorting to hard-coding exact indices.  An +offset+ beyond the
  # end of the row simply finds nothing and returns +nil+.
  #
  def field(header_or_index, minimum_index = 0)
    # slicing past the end of an Array yields nil, not an empty Array
    candidates = @row[minimum_index..-1]
    return nil if candidates.nil?

    # locate the pair
    finder = header_or_index.is_a?(Integer) ? :[] : :assoc
    pair   = candidates.send(finder, header_or_index)

    # return the field if we have a pair
    pair.nil? ? nil : pair.last
  end
  alias_method :[], :field

  #
  # :call-seq:
  #   []=( header, value )
  #   []=( header, offset, value )
  #   []=( index, value )
  #
  # Looks up the field by the semantics described in FasterCSV::Row.field()
  # and assigns the +value+.
  #
  # Assigning past the end of the row with an index will set all pairs between
  # to <tt>[nil, nil]</tt>.  Assigning to an unused header appends the new
  # pair.
  #
  def []=(*args)
    value = args.pop

    if args.first.is_a? Integer
      if @row[args.first].nil?  # extending past the end with index
        @row[args.first] = [nil, value]
        # Array#[]= filled the gap with nil; turn those into empty pairs
        @row.map! { |pair| pair.nil? ? [nil, nil] : pair }
      else                      # normal index assignment
        @row[args.first][1] = value
      end
    else
      position = index(*args)
      if position.nil?          # appending a field
        self << [args.first, value]
      else                      # normal header assignment
        @row[position][1] = value
      end
    end
  end

  #
  # :call-seq:
  #   <<( field )
  #   <<( header_and_field_array )
  #   <<( header_and_field_hash )
  #
  # If a two-element Array is provided, it is assumed to be a header and field
  # and the pair is appended.  A Hash works the same way with the key being
  # the header and the value being the field.  Anything else is assumed to be
  # a lone field which is appended with a +nil+ header.
  #
  # This method returns the row for chaining.
  #
  def <<(arg)
    if arg.is_a?(Array) and arg.size == 2  # appending a header and field
      @row << arg
    elsif arg.is_a?(Hash)                  # append header and field pairs
      arg.each { |pair| @row << pair }
    else                                   # append field value
      @row << [nil, arg]
    end

    self  # for chaining
  end

  #
  # A shortcut for appending multiple fields.  Equivalent to:
  #
  #   args.each { |arg| faster_csv_row << arg }
  #
  # This method returns the row for chaining.
  #
  def push(*args)
    args.each { |arg| self << arg }

    self  # for chaining
  end

  #
  # :call-seq:
  #   delete( header )
  #   delete( header, offset )
  #   delete( index )
  #
  # Used to remove a pair from the row by +header+ or +index+.  The pair is
  # located as described in FasterCSV::Row.field().  The deleted pair is
  # returned, or +nil+ if a pair could not be found.
  #
  def delete(header_or_index, minimum_index = 0)
    position = if header_or_index.is_a? Integer  # by index
      header_or_index
    else                                         # by header
      index(header_or_index, minimum_index)
    end

    # an unknown header has no position; Array#delete_at(nil) would raise
    position.nil? ? nil : @row.delete_at(position)
  end

  #
  # The provided +block+ is passed a header and field for each pair in the row
  # and expected to return +true+ or +false+, depending on whether the pair
  # should be deleted.
  #
  # This method returns the row for chaining.
  #
  def delete_if(&block)
    @row.delete_if(&block)

    self  # for chaining
  end

  #
  # This method accepts any number of arguments which can be headers, indices,
  # Ranges of either, or two-element Arrays containing a header and offset.
  # Each argument will be replaced with a field lookup as described in
  # FasterCSV::Row.field().
  #
  # If called with no arguments, all fields are returned.
  #
  def fields(*headers_and_or_indices)
    # return all fields--no arguments
    return @row.map { |pair| pair.last } if headers_and_or_indices.empty?

    # or work like values_at()
    selected = Array.new
    headers_and_or_indices.each do |h_or_i|
      if h_or_i.is_a? Range
        # translate header endpoints into indices, keeping the exclusivity
        index_begin = h_or_i.begin.is_a?(Integer) ? h_or_i.begin :
                                                    index(h_or_i.begin)
        index_end   = h_or_i.end.is_a?(Integer)   ? h_or_i.end :
                                                    index(h_or_i.end)
        new_range   = h_or_i.exclude_end? ? (index_begin...index_end) :
                                            (index_begin..index_end)
        selected.concat(fields.values_at(new_range))
      elsif h_or_i.is_a? Array  # a [header, offset] lookup
        selected << field(*h_or_i)
      else                      # a lone header (possibly nil) or index
        selected << field(h_or_i)
      end
    end
    selected
  end
  alias_method :values_at, :fields

  #
  # :call-seq:
  #   index( header )
  #   index( header, offset )
  #
  # This method will return the index of a field with the provided +header+.
  # The +offset+ can be used to locate duplicate header names, as described in
  # FasterCSV::Row.field().  Returns +nil+ when no such header exists at or
  # after the +offset+.
  #
  def index(header, minimum_index = 0)
    # an offset past the end of the row leaves nothing to search
    searchable = headers[minimum_index..-1]
    return nil if searchable.nil?

    # find the pair
    position = searchable.index(header)
    # return the index at the right offset, if we found one
    position.nil? ? nil : position + minimum_index
  end

  # Returns +true+ if +name+ is a header for this row, and +false+ otherwise.
  def header?(name)
    headers.include? name
  end
  alias_method :include?, :header?

  #
  # Returns +true+ if +data+ matches a field in this row, and +false+
  # otherwise.
  #
  def field?(data)
    fields.include? data
  end

  include Enumerable

  #
  # Yields each pair of the row as header and field tuples (much like
  # iterating over a Hash).
  #
  # Support for Enumerable.
  #
  # This method returns the row for chaining.
  #
  def each(&block)
    @row.each(&block)

    self  # for chaining
  end

  #
  # Returns +true+ if this row contains the same headers and fields in the
  # same order as +other+.  When +other+ is not a FasterCSV::Row, it is
  # compared directly against the internal Array of header and field pairs.
  #
  def ==(other)
    other.is_a?(Row) ? @row == other.row : @row == other
  end

  #
  # Collapses the row into a simple Hash.  Be warned that this discards field
  # order and clobbers duplicate fields (the last duplicate wins).
  #
  def to_hash
    hash = Hash.new
    @row.each { |header, field| hash[header] = field }
    hash
  end

  #
  # Returns the row as a CSV String.  Headers are not used.  Equivalent to:
  #
  #   faster_csv_row.fields.to_csv( options )
  #
  def to_csv(options = Hash.new)
    fields.to_csv(options)
  end
  alias_method :to_s, :to_csv

  # A summary of fields, by header.
  def inspect
    str = "#<#{self.class}"
    each do |header, field|
      str << " #{header.is_a?(Symbol) ? header.to_s : header.inspect}:" <<
             field.inspect
    end
    str << ">"
  end
end
|
384 |
|
|
385 |
|
#
|
386 |
|
# A FasterCSV::Table is a two-dimensional data structure for representing CSV
|
387 |
|
# documents. Tables allow you to work with the data by row or column,
|
388 |
|
# manipulate the data, and even convert the results back to CSV, if needed.
|
389 |
|
#
|
390 |
|
# All tables returned by FasterCSV will be constructed from this class, if
|
391 |
|
# header row processing is activated.
|
392 |
|
#
|
393 |
|
class Table
  #
  # Construct a new FasterCSV::Table from +array_of_rows+, which are expected
  # to be FasterCSV::Row objects.  All rows are assumed to have the same
  # headers.
  #
  # A FasterCSV::Table object supports the following Array methods through
  # delegation:
  #
  # * empty?()
  # * length()
  # * size()
  #
  def initialize(array_of_rows)
    @table = array_of_rows
    @mode  = :col_or_row
  end

  # The current access mode for indexing and iteration.
  attr_reader :mode

  # Internal data format used to compare equality.
  attr_reader :table
  protected   :table

  ### Array Delegation ###

  extend Forwardable
  def_delegators :@table, :empty?, :length, :size

  #
  # Returns a duplicate table object, in column mode.  This is handy for
  # chaining in a single call without changing the table mode, but be aware
  # that this method can consume a fair amount of memory for bigger data sets.
  #
  # This method returns the duplicate table for chaining.  Don't chain
  # destructive methods (like []=()) this way though, since you are working
  # with a duplicate.
  #
  def by_col
    self.class.new(@table.dup).by_col!
  end

  #
  # Switches the mode of this table to column mode.  All calls to indexing and
  # iteration methods will work with columns until the mode is changed again.
  #
  # This method returns the table and is safe to chain.
  #
  def by_col!
    @mode = :col

    self
  end

  #
  # Returns a duplicate table object, in mixed mode.  This is handy for
  # chaining in a single call without changing the table mode, but be aware
  # that this method can consume a fair amount of memory for bigger data sets.
  #
  # This method returns the duplicate table for chaining.  Don't chain
  # destructive methods (like []=()) this way though, since you are working
  # with a duplicate.
  #
  def by_col_or_row
    self.class.new(@table.dup).by_col_or_row!
  end

  #
  # Switches the mode of this table to mixed mode.  All calls to indexing and
  # iteration methods will use the default intelligent indexing system until
  # the mode is changed again.  In mixed mode an index is assumed to be a row
  # reference while anything else is assumed to be column access by headers.
  #
  # This method returns the table and is safe to chain.
  #
  def by_col_or_row!
    @mode = :col_or_row

    self
  end

  #
  # Returns a duplicate table object, in row mode.  This is handy for chaining
  # in a single call without changing the table mode, but be aware that this
  # method can consume a fair amount of memory for bigger data sets.
  #
  # This method returns the duplicate table for chaining.  Don't chain
  # destructive methods (like []=()) this way though, since you are working
  # with a duplicate.
  #
  def by_row
    self.class.new(@table.dup).by_row!
  end

  #
  # Switches the mode of this table to row mode.  All calls to indexing and
  # iteration methods will work with rows until the mode is changed again.
  #
  # This method returns the table and is safe to chain.
  #
  def by_row!
    @mode = :row

    self
  end

  #
  # Returns the headers for the first row of this table (assumed to match all
  # other rows).  An empty Array is returned for empty tables.
  #
  def headers
    @table.empty? ? Array.new : @table.first.headers
  end

  #
  # In the default mixed mode, this method returns rows for index access and
  # columns for header access.  You can force the index association by first
  # calling by_col!() or by_row!().
  #
  # Columns are returned as an Array of values.  Altering that Array has no
  # effect on the table.
  #
  def [](index_or_header)
    if row_access?(index_or_header)  # by index
      @table[index_or_header]
    else                             # by header
      @table.map { |row| row[index_or_header] }
    end
  end

  #
  # In the default mixed mode, this method assigns rows for index access and
  # columns for header access.  You can force the index association by first
  # calling by_col!() or by_row!().
  #
  # Rows may be set to an Array of values (which will inherit the table's
  # headers()) or a FasterCSV::Row.
  #
  # Columns may be set to a single value, which is copied to each row of the
  # column, or an Array of values.  Arrays of values are assigned to rows top
  # to bottom in row major order.  Excess values are ignored and if the Array
  # does not have a value for each row the extra rows will receive a +nil+.
  #
  # Assigning to an existing column or row clobbers the data.  Assigning to
  # new columns creates them at the right end of the table.
  #
  def []=(index_or_header, value)
    if row_access?(index_or_header)  # by index
      @table[index_or_header] = if value.is_a? Array
        Row.new(headers, value)
      else
        value
      end
    else                             # set column
      @table.each_with_index do |row, i|
        row[index_or_header] = if row.header_row?
          index_or_header  # a header row names the column after itself
        elsif value.is_a? Array
          value[i]         # multiple values, one per row
        else
          value            # repeated value
        end
      end
    end
  end

  #
  # The mixed mode default is to treat a list of indices as row access,
  # returning the rows indicated.  Anything else is considered columnar
  # access.  For columnar access, the return set has an Array for each row
  # with the values indicated by the headers in each Array.  You can force
  # column or row mode using by_col!() or by_row!().
  #
  # You cannot mix column and row access.
  #
  def values_at(*indices_or_headers)
    if @mode == :row or  # by indices
       ( @mode == :col_or_row and indices_or_headers.all? do |index|
                                    index.is_a?(Integer)         or
                                    ( index.is_a?(Range)         and
                                      index.first.is_a?(Integer) and
                                      index.last.is_a?(Integer) )
                                  end )
      @table.values_at(*indices_or_headers)
    else                 # by headers
      @table.map { |row| row.values_at(*indices_or_headers) }
    end
  end

  #
  # Adds a new row to the bottom end of this table.  You can provide an Array,
  # which will be converted to a FasterCSV::Row (inheriting the table's
  # headers()), or a FasterCSV::Row.
  #
  # This method returns the table for chaining.
  #
  def <<(row_or_array)
    if row_or_array.is_a? Array  # append Array
      @table << Row.new(headers, row_or_array)
    else                         # append Row
      @table << row_or_array
    end

    self  # for chaining
  end

  #
  # A shortcut for appending multiple rows.  Equivalent to:
  #
  #   rows.each { |row| self << row }
  #
  # This method returns the table for chaining.
  #
  def push(*rows)
    rows.each { |row| self << row }

    self  # for chaining
  end

  #
  # Removes and returns the indicated column or row.  In the default mixed
  # mode indices refer to rows and everything else is assumed to be a column
  # header.  Use by_col!() or by_row!() to force the lookup.
  #
  # When a column is removed, the result holds one value per row, with +nil+
  # standing in for any row that did not have the column.
  #
  def delete(index_or_header)
    if row_access?(index_or_header)  # by index
      @table.delete_at(index_or_header)
    else                             # by header
      @table.map do |row|
        # a row lacking this column gives back no pair to read a value from
        pair = row.delete(index_or_header)
        pair.nil? ? nil : pair.last
      end
    end
  end

  #
  # Removes any column or row for which the block returns +true+.  In the
  # default mixed mode or row mode, iteration is the standard row major
  # walking of rows.  In column mode, iteration will +yield+ two element
  # tuples containing the column name and an Array of values for that column.
  #
  # This method returns the table for chaining.
  #
  def delete_if(&block)
    if @mode == :row or @mode == :col_or_row  # by index
      @table.delete_if(&block)
    else                                      # by header
      # collect first, delete afterwards, so headers() is stable while walking
      to_delete = Array.new
      headers.each do |header|
        to_delete << header if block[[header, self[header]]]
      end
      to_delete.each { |header| delete(header) }
    end

    self  # for chaining
  end

  include Enumerable

  #
  # In the default mixed mode or row mode, iteration is the standard row major
  # walking of rows.  In column mode, iteration will +yield+ two element
  # tuples containing the column name and an Array of values for that column.
  #
  # This method returns the table for chaining.
  #
  def each(&block)
    if @mode == :col
      headers.each { |header| block[[header, self[header]]] }
    else
      @table.each(&block)
    end

    self  # for chaining
  end

  #
  # Returns +true+ if all rows of this table ==() +other+'s rows.  When
  # +other+ is not a FasterCSV::Table, it is compared directly against the
  # internal Array of rows.
  #
  def ==(other)
    other.is_a?(Table) ? @table == other.table : @table == other
  end

  #
  # Returns the table as an Array of Arrays.  Headers will be the first row,
  # then all of the field rows will follow.
  #
  def to_a
    array = [headers]
    # header rows are skipped, since the headers were already listed first
    @table.each { |row| array << row.fields unless row.header_row? }
    array
  end

  #
  # Returns the table as a complete CSV String.  Headers will be listed first,
  # then all of the field rows.
  #
  def to_csv(options = Hash.new)
    rows = [headers.to_csv(options)]
    @table.each do |row|
      rows << row.fields.to_csv(options) unless row.header_row?
    end
    rows.join
  end
  alias_method :to_s, :to_csv

  # A short summary of the table: its class, access mode, and row count
  # (the count is taken from to_a(), so it includes the leading headers row).
  def inspect
    "#<#{self.class} mode:#{@mode} row_count:#{to_a.size}>"
  end

  private

  #
  # Returns +true+ when +index_or_header+ should address a row:  always in
  # row mode, and for Integer arguments in the default mixed mode.
  #
  def row_access?(index_or_header)
    @mode == :row or
    (@mode == :col_or_row and index_or_header.is_a? Integer)
  end
end
|
720 |
|
|
721 |
|
# The error thrown when the parser encounters illegal CSV formatting.
|
722 |
|
class MalformedCSVError < RuntimeError
end

#
# FieldInfo is a Struct describing where a field sits in the data source it
# was read from.  FasterCSV hands this Struct to some blocks that make
# decisions based on field structure.  See FasterCSV.convert_fields() for an
# example.
#
# <b><tt>index</tt></b>::  The zero-based index of the field in its row.
# <b><tt>line</tt></b>::   The line of the data source this row is from.
# <b><tt>header</tt></b>:: The header for the column, when available.
#
FieldInfo = Struct.new( :index,
                        :line,
                        :header )
|
735 |
|
|
736 |
|
# A Regexp used to find and convert some common Date formats.
|
737 |
|
DateMatcher     = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} |
                        \d{4}-\d{2}-\d{2} )\z /x
# A Regexp used to find and convert some common DateTime formats.
DateTimeMatcher =
  / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} |
          \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} )\z /x
#
# This Hash holds the built-in converters of FasterCSV that can be accessed by
# name.  You can select Converters with FasterCSV.convert() or through the
# +options+ Hash passed to FasterCSV::new().
#
# <b><tt>:integer</tt></b>::    Converts any field Integer() accepts.
# <b><tt>:float</tt></b>::      Converts any field Float() accepts.
# <b><tt>:numeric</tt></b>::    A combination of <tt>:integer</tt>
#                               and <tt>:float</tt>.
# <b><tt>:date</tt></b>::       Converts any field Date::parse() accepts.
# <b><tt>:date_time</tt></b>::  Converts any field DateTime::parse() accepts.
# <b><tt>:all</tt></b>::        All built-in converters.  A combination of
#                               <tt>:date_time</tt> and <tt>:numeric</tt>.
#
# Each converter hands the field back untouched when it cannot convert it.
#
# This Hash is intentionally left unfrozen and users should feel free to add
# values to it that can be accessed by all FasterCSV objects.
#
# To add a combo field, the value should be an Array of names.  Combo fields
# can be nested with other combo fields.
#
Converters = {
  :integer => lambda { |field|
    begin
      Integer(field)
    rescue StandardError
      field
    end
  },
  :float => lambda { |field|
    begin
      Float(field)
    rescue StandardError
      field
    end
  },
  :numeric => [:integer, :float],
  :date => lambda { |field|
    if field =~ DateMatcher
      begin
        Date.parse(field)
      rescue StandardError
        field
      end
    else
      field
    end
  },
  :date_time => lambda { |field|
    if field =~ DateTimeMatcher
      begin
        DateTime.parse(field)
      rescue StandardError
        field
      end
    else
      field
    end
  },
  :all => [:date_time, :numeric]
}
|
773 |
|
|
774 |
|
#
|
775 |
|
# This Hash holds the built-in header converters of FasterCSV that can be
|
776 |
|
# accessed by name. You can select HeaderConverters with
|
777 |
|
# FasterCSV.header_convert() or through the +options+ Hash passed to
|
778 |
|
# FasterCSV::new().
|
779 |
|
#
|
780 |
|
# <b><tt>:downcase</tt></b>:: Calls downcase() on the header String.
|
781 |
|
# <b><tt>:symbol</tt></b>:: The header String is downcased, spaces are
|
782 |
|
# replaced with underscores, non-word characters
|
783 |
|
# are dropped, and finally to_sym() is called.
|
784 |
|
#
|
785 |
|
# This Hash is intentionally left unfrozen and users should feel free to add
|
786 |
|
# values to it that can be accessed by all FasterCSV objects.
|
787 |
|
#
|
788 |
|
# To add a combo field, the value should be an Array of names. Combo fields
|
789 |
|
# can be nested with other combo fields.
|
790 |
|
#
|
791 |
|
HeaderConverters = {
  # lower-case the header, leaving it a String
  :downcase => lambda { |header| header.downcase },
  # lower-case, swap spaces for underscores, drop every character outside
  # a-z, 0-9 and underscore, then convert to a Symbol
  :symbol   => lambda { |header|
    underscored = header.downcase.tr(" ", "_")
    underscored.delete("^a-z0-9_").to_sym
  }
}
|
797 |
|
|
798 |
|
#
|
799 |
|
# The options used when no overrides are given by calling code. They are:
|
800 |
|
#
|
801 |
|
# <b><tt>:col_sep</tt></b>:: <tt>","</tt>
|
802 |
|
# <b><tt>:row_sep</tt></b>:: <tt>:auto</tt>
|
803 |
|
# <b><tt>:quote_char</tt></b>:: <tt>'"'</tt>
|
804 |
|
# <b><tt>:converters</tt></b>:: +nil+
|
805 |
|
# <b><tt>:unconverted_fields</tt></b>:: +nil+
|
806 |
|
# <b><tt>:headers</tt></b>:: +false+
|
807 |
|
# <b><tt>:return_headers</tt></b>:: +false+
|
808 |
|
# <b><tt>:header_converters</tt></b>:: +nil+
|
809 |
|
# <b><tt>:skip_blanks</tt></b>:: +false+
|
810 |
|
# <b><tt>:force_quotes</tt></b>:: +false+
|
811 |
|
#
|
812 |
|
DEFAULT_OPTIONS = {
  # parsing and generation separators
  :col_sep            => ",",
  :row_sep            => :auto,
  :quote_char         => '"',
  # field conversion
  :converters         => nil,
  :unconverted_fields => nil,
  # header handling
  :headers            => false,
  :return_headers     => false,
  :header_converters  => nil,
  # miscellaneous switches
  :skip_blanks        => false,
  :force_quotes       => false
}.freeze
|
822 |
|
|
823 |
|
#
|
824 |
|
# This method will build a drop-in replacement for many of the standard CSV
|
825 |
|
# methods. It allows you to write code like:
|
826 |
|
#
|
827 |
|
# begin
|
828 |
|
# require "faster_csv"
|
829 |
|
# FasterCSV.build_csv_interface
|
830 |
|
# rescue LoadError
|
831 |
|
# require "csv"
|
832 |
|
# end
|
833 |
|
# # ... use CSV here ...
|
834 |
|
#
|
835 |
|
# This is not a complete interface with completely identical behavior.
|
836 |
|
# However, it is intended to be close enough that you won't notice the
|
837 |
|
# difference in most cases. CSV methods supported are:
|
838 |
|
#
|
839 |
|
# * foreach()
|
840 |
|
# * generate_line()
|
841 |
|
# * open()
|
842 |
|
# * parse()
|
843 |
|
# * parse_line()
|
844 |
|
# * readlines()
|
845 |
|
#
|
846 |
|
# Be warned that this interface is slower than vanilla FasterCSV due to the
|
847 |
|
# extra layer of method calls. Depending on usage, this can slow it down to
|
848 |
|
# near CSV speeds.
|
849 |
|
#
|
850 |
|
def self.build_csv_interface
  # Install a fresh, empty class as the top-level CSV constant, then give it
  # class methods that forward to FasterCSV, translating the positional
  # separator arguments (fs, rs) into :col_sep and :row_sep options.
  csv_class = Class.new
  Object.const_set(:CSV, csv_class)

  csv_class.class_eval do
    class << self
      def foreach(path, rs = :auto, &block)  # :nodoc:
        FasterCSV.foreach(path, :row_sep => rs, &block)
      end

      def generate_line(row, fs = ",", rs = "")  # :nodoc:
        FasterCSV.generate_line(row, :col_sep => fs, :row_sep => rs)
      end

      def open(path, mode, fs = ",", rs = :auto, &block)  # :nodoc:
        # without a block, or when not reading, simply hand everything over
        unless block and mode.include? "r"
          return FasterCSV.open( path, mode, :col_sep => fs,
                                             :row_sep => rs, &block )
        end

        # reading with a block:  the block is run once for each row
        FasterCSV.open(path, mode, :col_sep => fs, :row_sep => rs) do |csv|
          csv.each(&block)
        end
      end

      def parse(str_or_readable, fs = ",", rs = :auto, &block)  # :nodoc:
        FasterCSV.parse( str_or_readable, :col_sep => fs,
                                          :row_sep => rs, &block )
      end

      def parse_line(src, fs = ",", rs = :auto)  # :nodoc:
        FasterCSV.parse_line(src, :col_sep => fs, :row_sep => rs)
      end

      def readlines(path, rs = :auto)  # :nodoc:
        FasterCSV.readlines(path, :row_sep => rs)
      end
    end
  end
end
|
883 |
|
|
884 |
|
#
|
885 |
|
# This method allows you to serialize an Array of Ruby objects to a String or
|
886 |
|
# File of CSV data. This is not as powerful as Marshal or YAML, but perhaps
|
887 |
|
# useful for spreadsheet and database interaction.
|
888 |
|
#
|
889 |
|
# Out of the box, this method is intended to work with simple data objects or
|
890 |
|
# Structs. It will serialize a list of instance variables and/or
|
891 |
|
# Struct.members().
|
892 |
|
#
|
893 |
|
# If you need more complicated serialization, you can control the process
|
894 |
|
# by adding methods to the class to be serialized.
|
895 |
|
#
|
896 |
|
# A class method csv_meta() is responsible for returning the first row of the
|
897 |
|
# document (as an Array). This row is considered to be a Hash of the form
|
898 |
|
# key_1,value_1,key_2,value_2,... FasterCSV::load() expects to find a class
|
899 |
|
# key with a value of the stringified class name and FasterCSV::dump() will
|
900 |
|
# create this, if you do not define this method. This method is only called
|
901 |
|
# on the first object of the Array.
|
902 |
|
#
|
903 |
|
# The next method you can provide is an instance method called csv_headers().
|
904 |
|
# This method is expected to return the second line of the document (again as
|
905 |
|
# an Array), which is to be used to give each column a header. By default,
|
906 |
|
# FasterCSV::load() will set an instance variable if the field header starts
|
907 |
|
# with an @ character or call send() passing the header as the method name and
|
908 |
|
# the field value as an argument. This method is only called on the first
|
909 |
|
# object of the Array.
|
910 |
|
#
|
911 |
|
# Finally, you can provide an instance method called csv_dump(), which will
|
912 |
|
# be passed the headers. This should return an Array of fields that can be
|
913 |
|
# serialized for this object. This method is called once for every object in
|
914 |
|
# the Array.
|
915 |
|
#
|
916 |
|
# The +io+ parameter can be used to serialize to a File, and +options+ can be
|
917 |
|
# anything FasterCSV::new() accepts.
|
918 |
|
#
|
919 |
|
def self.dump(ary_of_objs, io = "", options = Hash.new)
  # Serializes +ary_of_objs+ as CSV through FasterCSV.new(io, options):
  # one meta row, one header row, then one row per object.  Returns
  # csv.string when +io+ is a String, otherwise the result of csv.close.
  #
  # The optional hooks (csv_meta, csv_headers, csv_dump) are detected with
  # respond_to? instead of rescuing NoMethodError, so that a NoMethodError
  # raised *inside* a user-supplied hook propagates to the caller rather
  # than being silently replaced by the default serialization.
  obj_template = ary_of_objs.first
  klass        = obj_template.class

  csv = FasterCSV.new(io, options)

  # write meta information
  if klass.respond_to? :csv_meta
    csv << klass.csv_meta
  else
    csv << [:class, klass]
  end

  # write headers
  if obj_template.respond_to? :csv_headers
    headers = obj_template.csv_headers
  else
    headers = obj_template.instance_variables.sort
    # Struct-based objects additionally expose their members, recorded as
    # setter-style names ("member=").
    if klass.ancestors.find { |cls| cls.to_s =~ /\AStruct\b/ }
      headers += obj_template.members.map { |mem| "#{mem}=" }.sort
    end
  end
  csv << headers

  # serialize each object
  ary_of_objs.each do |obj|
    if obj.respond_to? :csv_dump
      csv << obj.csv_dump(headers)
    else
      fields = headers.map { |var|
        name = var.to_s
        if name[0, 1] == "@"
          # instance variable header: read the variable directly
          obj.instance_variable_get(var)
        else
          # "member=" header: drop the trailing "=" and index the object
          obj[name[0..-2]]
        end
      }
      csv << fields
    end
  end

  if io.is_a? String
    csv.string
  else
    csv.close
  end
end
|
963 |
|
|
964 |
|
#
|
965 |
|
# :call-seq:
|
966 |
|
# filter( options = Hash.new ) { |row| ... }
|
967 |
|
# filter( input, options = Hash.new ) { |row| ... }
|
968 |
|
# filter( input, output, options = Hash.new ) { |row| ... }
|
969 |
|
#
|
970 |
|
# This method is a convenience for building Unix-like filters for CSV data.
|
971 |
|
# Each row is yielded to the provided block which can alter it as needed.
|
972 |
|
# After the block returns, the row is appended to +output+ altered or not.
|
973 |
|
#
|
974 |
|
# The +input+ and +output+ arguments can be anything FasterCSV::new() accepts
|
975 |
|
# (generally String or IO objects). If not given, they default to
|
976 |
|
# <tt>ARGF</tt> and <tt>$stdout</tt>.
|
977 |
|
#
|
978 |
|
# The +options+ parameter is also filtered down to FasterCSV::new() after some
|
979 |
|
# clever key parsing. Any key beginning with <tt>:in_</tt> or
|
980 |
|
# <tt>:input_</tt> will have that leading identifier stripped and will only
|
981 |
|
# be used in the +options+ Hash for the +input+ object. Keys starting with
|
982 |
|
# <tt>:out_</tt> or <tt>:output_</tt> affect only +output+. All other keys
|
983 |
|
# are assigned to both objects.
|
984 |
|
#
|
985 |
|
# The <tt>:output_row_sep</tt> +option+ defaults to
|
986 |
|
# <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
|
987 |
|
#
|
988 |
|
def self.filter(*args)
|
989 |
|
# parse options for input, output, or both
|
990 |
|
in_options, out_options = Hash.new, {:row_sep => $INPUT_RECORD_SEPARATOR}
|
991 |
|
if args.last.is_a? Hash
|
992 |
|
args.pop.each do |key, value|
|
993 |
|
case key.to_s
|
994 |
|
when /\Ain(?:put)?_(.+)\Z/
|
995 |
|
in_options[$1.to_sym] = value
|
996 |
|
when /\Aout(?:put)?_(.+)\Z/
|
997 |
|
out_options[$1.to_sym] = value
|
998 |
|
else
|
999 |
|
in_options[key] = value
|