|
1 |
module Redmine
|
|
2 |
class TextExtractor
|
|
3 |
|
|
4 |
# Size cap handed to `limit` when #text truncates the extracted fulltext.
MAX_FULLTEXT_LENGTH = 4.megabytes
|
|
5 |
# Extractor settings read from the 'text_extractors' configuration entry
# (an empty hash when that entry is unset). The command-based handlers look
# up their command line here under the keys 'pdftotext', 'unrtf', 'catdoc',
# 'xls2csv' and 'catppt'.
TEXT_EXTRACTORS = Redmine::Configuration['text_extractors'] || {}
|
|
6 |
|
|
7 |
# Remembers the attachment whose fulltext will be extracted by #text.
#
# attachment - object handed to the handlers' accept? and text methods.
def initialize(attachment)
  @attachment = attachment
end
|
|
10 |
|
|
11 |
# Extracts the fulltext of the wrapped attachment.
#
# The text produced by the matching handler has every whitespace run
# collapsed to a single space, is stripped, composed and limited to
# MAX_FULLTEXT_LENGTH.
#
# Returns nil when no handler accepts the attachment, when the handler
# produced no text, or when a StandardError was raised during extraction
# (the error is logged). Anything that is not a StandardError is logged and
# re-raised.
def text
  handler = find_handler
  return unless handler

  text = handler.text(@attachment)
  return unless text

  text.gsub!(/\s+/m, ' ')
  text.strip!
  text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
rescue Exception => e
  Rails.logger.error "error in fulltext extraction: #{e}"
  # re-raise Signals / SyntaxErrors etc; only StandardErrors are swallowed
  raise e unless e.is_a?(StandardError)
end
|
|
23 |
|
|
24 |
private
|
|
25 |
|
|
26 |
# Returns the first handler in the chain that accepts the attachment,
# or nil when none does.
def find_handler
  @@file_handlers.find { |candidate| candidate.accept?(@attachment) }
end
|
|
29 |
|
|
30 |
|
|
31 |
# Base class of all handlers. Subclasses declare what they can process by
# setting either @content_type (a single type) or @content_types (a list).
class FileHandler
  # Tells whether this handler is responsible for the attachment.
  #
  # @content_type takes precedence over @content_types; a handler that sets
  # neither accepts nothing.
  def accept?(attachment)
    return attachment.content_type == @content_type if @content_type
    return @content_types.include?(attachment.content_type) if @content_types

    false
  end
end
|
|
42 |
|
|
43 |
# Base class for handlers that obtain the text by running an external
# command. Subclasses set @command to an argument list in which the
# FILE_PLACEHOLDER entry stands for the attachment's file on disk.
class ExternalCommandHandler < FileHandler
  include Redmine::Utils::Shell

  FILE_PLACEHOLDER = '__FILE__'.freeze

  # Runs the command with the placeholder replaced by attachment.diskfile
  # and returns what shellout hands back, converted with to_s.
  def text(attachment)
    argv = @command.dup
    placeholder_at = argv.index(FILE_PLACEHOLDER)
    argv[placeholder_at] = attachment.diskfile
    output = shellout(argv) { |io| io.read }
    output.to_s
  end

  # Accepts the attachment only when the content type matches and the
  # command is available.
  def accept?(attachment)
    super(attachment) && available?
  end

  # True when a command is configured and its first entry is executable.
  def available?
    @command.present? && File.executable?(@command[0])
  end

  # Class-level shortcut that checks availability on a fresh instance.
  def self.available?
    new.available?
  end
end
|
|
66 |
|
|
67 |
|
|
68 |
# Handler for 'application/pdf' attachments. Runs the command configured
# under 'pdftotext', falling back to DEFAULT.
class PdfHandler < ExternalCommandHandler
  DEFAULT = %w[/usr/bin/pdftotext -enc UTF-8 __FILE__ -].freeze

  def initialize
    @command = TEXT_EXTRACTORS['pdftotext'] || DEFAULT
    @content_type = 'application/pdf'
  end
end
|
|
77 |
|
|
78 |
|
|
79 |
# Handler for 'application/rtf' attachments. Runs the command configured
# under 'unrtf', falling back to DEFAULT.
class RtfHandler < ExternalCommandHandler
  DEFAULT = %w[/usr/bin/unrtf --text __FILE__].freeze

  def initialize
    @command = TEXT_EXTRACTORS['unrtf'] || DEFAULT
    @content_type = 'application/rtf'
  end
end
|
|
88 |
|
|
89 |
|
|
90 |
# Handler base class for XML based (MS / Open / Libre) office documents.
|
|
91 |
class ZippedXmlHandler < FileHandler
|
|
92 |
|
|
93 |
# SAX callback object that gathers the character data found inside every
# element with the given name and namespace URI. Each such element's text
# is followed by a single space in the collected result.
class SaxDocument < Nokogiri::XML::SAX::Document
  attr_reader :text

  # text_element   - name of the element that carries text.
  # text_namespace - namespace URI that element must belong to.
  def initialize(text_element, text_namespace)
    @element = text_element
    @namespace_uri = text_namespace
    @text = +''
    @is_text = false
  end

  # Opening tag: start collecting once the text element is entered.
  def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
    @is_text = true if text_element?(name, uri)
  end

  # Character data: kept only while inside a text element.
  def characters(string)
    return unless @is_text

    @text << string
  end

  # Closing tag: separate from the next chunk and stop collecting.
  def end_element_namespace(name, prefix = nil, uri = nil)
    return unless text_element?(name, uri)

    @text << ' '
    @is_text = false
  end

  private

  # True when name/uri identify the configured text element.
  def text_element?(name, uri)
    name == @element && uri == @namespace_uri
  end
end
|
|
123 |
|
|
124 |
# Opens the attachment as a zip archive and extracts the text of the entry
# named @file.
#
# Returns the result of xml_to_text for that entry, or nil when the archive
# contains no such entry.
def text(attachment)
  Zip::File.open(attachment.diskfile) do |zip_file|
    zip_file.each do |entry|
      return xml_to_text(entry.get_input_stream) if entry.name == @file
    end
  end
  # No matching entry: answer nil explicitly. Without this the method would
  # return whatever the open block evaluated to, which is not a String.
  nil
end
|
|
133 |
|
|
134 |
private
|
|
135 |
|
|
136 |
# Feeds io through a SAX parser and returns the text collected from the
# elements matching @element / @namespace_uri.
def xml_to_text(io)
  collector = SaxDocument.new(@element, @namespace_uri)
  parser = Nokogiri::XML::SAX::Parser.new(collector)
  parser.parse(io)
  collector.text
end
|
|
141 |
end
|
|
142 |
|
|
143 |
|
|
144 |
# Common parent of the extractors for MS Office formats: all of them collect
# text from elements named 't'.
class OfficeDocumentHandler < ZippedXmlHandler
  def initialize
    super()
    @element = 't'
  end
end
|
|
151 |
|
|
152 |
|
|
153 |
# Handler for wordprocessingml documents: reads 'word/document.xml'.
class DocxHandler < OfficeDocumentHandler
  def initialize
    super()
    @file = 'word/document.xml'
    @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  end
end
|
|
161 |
|
|
162 |
|
|
163 |
# Handler for spreadsheetml sheets: reads 'xl/sharedStrings.xml'.
class XlsxHandler < OfficeDocumentHandler
  def initialize
    super()
    @file = 'xl/sharedStrings.xml'
    @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  end
end
|
|
171 |
|
|
172 |
|
|
173 |
|
|
174 |
class PptxHandler < OfficeDocumentHandler
|
|
175 |
# Content types handled by this class. Frozen so the shared list cannot be
# modified by accident.
CONTENT_TYPES = [
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
  'application/vnd.ms-powerpoint.template.macroEnabled.12',
].freeze
|
|
180 |
|
|
181 |
# Sets the accepted content types and the drawingml namespace in which the
# text elements are looked up.
def initialize
  super()
  @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
  @content_types = CONTENT_TYPES
end
|
|
186 |
|
|
187 |
# Extracts the text of all slides in the archive and joins it with single
# spaces, ordered by slide number.
#
# Returns an empty String when the archive contains no slide entries.
def text(attachment)
  slides = []
  Zip::File.open(attachment.diskfile) do |zip_file|
    zip_file.each do |entry|
      # Anchored at the end of the name so that relationship files such as
      # slide1.xml.rels are not parsed as slides.
      match = /slide(\d+)\.xml\z/.match(entry.name)
      next unless match

      # Keep the number as an Integer: comparing the digits as Strings
      # would put slide10 before slide2.
      slides << [match[1].to_i, xml_to_text(entry.get_input_stream)]
    end
  end
  slides.sort_by(&:first).map(&:last).join(' ')
end
|
|
199 |
end
|
|
200 |
|
|
201 |
|
|
202 |
# Extractor for Open / Libre Office formats. Reads 'content.xml' from the
# archive and collects the text of the 'p' elements in the opendocument
# text namespace.
class OpendocumentHandler < ZippedXmlHandler
  # Frozen so the shared list cannot be modified by accident.
  CONTENT_TYPES = [
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.presentation-template',
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.text-template',
    'application/vnd.oasis.opendocument.spreadsheet',
    'application/vnd.oasis.opendocument.spreadsheet-template'
  ].freeze

  def initialize
    super
    @file = 'content.xml'
    @content_types = CONTENT_TYPES
    @element = 'p'
    @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
  end
end
|
|
220 |
|
|
221 |
|
|
222 |
|
|
223 |
# Handler for the ms-word content types. Runs the command configured under
# 'catdoc', falling back to DEFAULT.
class DocHandler < ExternalCommandHandler
  CONTENT_TYPES = [
    'application/vnd.ms-word',
    'application/msword',
  ].freeze

  # Frozen like the defaults of the PDF and RTF handlers; the command is
  # duplicated before the file placeholder is substituted.
  DEFAULT = [
    '/usr/bin/catdoc', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['catdoc'] || DEFAULT
  end
end
|
|
236 |
|
|
237 |
# Handler for the ms-excel content types. Runs the command configured under
# 'xls2csv', falling back to DEFAULT, and post-processes its output.
class XlsHandler < ExternalCommandHandler
  CONTENT_TYPES = [
    'application/vnd.ms-excel',
    'application/excel',
  ].freeze

  # Frozen like the defaults of the PDF and RTF handlers; the command is
  # duplicated before the file placeholder is substituted.
  DEFAULT = [
    '/usr/bin/xls2csv', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['xls2csv'] || DEFAULT
  end

  # Returns the command output with all double quotes removed and every run
  # of commas replaced by a single space; nil when the parent returned
  # nothing.
  def text(*_)
    csv = super
    return unless csv

    # Explicit parentheses: `gsub /,+/, ' '` is an ambiguous first argument.
    csv.delete('"').gsub(/,+/, ' ')
  end
end
|
|
255 |
|
|
256 |
# Handler for the ms-powerpoint content types. Runs the command configured
# under 'catppt', falling back to DEFAULT.
class PptHandler < ExternalCommandHandler
  CONTENT_TYPES = [
    'application/vnd.ms-powerpoint',
    'application/powerpoint',
  ].freeze

  # Frozen like the defaults of the PDF and RTF handlers; the command is
  # duplicated before the file placeholder is substituted.
  DEFAULT = [
    '/usr/bin/catppt', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['catppt'] || DEFAULT
  end
end
|
|
269 |
|
|
270 |
|
|
271 |
# Handler for 'text/csv' and 'text/plain' attachments: the file content
# itself is the text.
class PlaintextHandler < FileHandler
  CONTENT_TYPES = %w(text/csv text/plain).freeze

  def initialize
    @content_types = CONTENT_TYPES
  end

  # Reads the whole file and passes it through Redmine::CodesetUtil.to_utf8
  # with 'UTF-8' as the encoding argument.
  def text(attachment)
    # File.read instead of IO.read: IO.read treats a path that starts with
    # '|' as a command to run, File.read always opens a file.
    Redmine::CodesetUtil.to_utf8 File.read(attachment.diskfile), 'UTF-8'
  end
end
|
|
280 |
|
|
281 |
# The handler chain: one instance of every handler class, created when this
# class body is evaluated. List the most specific handlers first and the
# more general (fallback) handlers later.
@@file_handlers = [
  PdfHandler,
  OpendocumentHandler,
  DocxHandler,
  XlsxHandler,
  PptxHandler,
  DocHandler,
  XlsHandler,
  PptHandler,
  RtfHandler,
  PlaintextHandler
].map { |handler_class| handler_class.new }
|
|
291 |
|
|
292 |
end
|
|
293 |
|
|
294 |
end
|
|
295 |
|