From 262dc4ccce3893ba6ee1cb6b7e65c93e6d935744 Mon Sep 17 00:00:00 2001
From: MAEDA Go
Date: Sat, 19 Oct 2024 16:00:50 +0900
Subject: [PATCH] Fix CSV file encoding auto-detection failure with multibyte characters

---
 app/models/import.rb                        | 25 ++++++++++++++++++++++++-
 test/fixtures/files/mbcs-multiline-text.txt | 17 +++++++++++++++++
 test/unit/issue_import_test.rb              | 17 +++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 test/fixtures/files/mbcs-multiline-text.txt

diff --git a/app/models/import.rb b/app/models/import.rb
index 94e44c5e2..a27a4170d 100644
--- a/app/models/import.rb
+++ b/app/models/import.rb
@@ -69,7 +69,9 @@ class Import < ApplicationRecord
     encoding = lu(user, :general_csv_encoding)
     if file_exists?
       begin
-        content = File.read(filepath, 256)
+        # Reading a specified number of bytes may corrupt the trailing
+        # multi-byte character, so we read the first few lines instead.
+        content = read_head_lines
 
         separator = [',', ';'].max_by {|sep| content.count(sep)}
         wrapper = ['"', "'"].max_by {|quote_char| content.count(quote_char)}
@@ -123,6 +125,27 @@ class Import < ApplicationRecord
     filepath.present? && File.exist?(filepath)
   end
 
+  # Returns the beginning of the file, used to guess the separator, the
+  # wrapper and the encoding.
+  #
+  # At most byte_limit bytes are read. If the file is longer than that, the
+  # chunk is cut after its last LF so that it is less likely to end in the
+  # middle of a multi-byte character; a chunk without any LF is returned as
+  # read. A file that fits within byte_limit is returned whole, so its last
+  # line is kept even without a trailing newline.
+  #
+  # Returns nil if the file does not exist or is empty.
+  def read_head_lines(byte_limit = 4096)
+    return nil unless file_exists?
+
+    # File.read with a length returns nil, not "", when the file is empty
+    chunk = File.read(filepath, byte_limit)
+    return chunk if chunk.nil? || File.size(filepath) <= byte_limit
+
+    last_lf_index = chunk.rindex("\n")
+    last_lf_index ? chunk[..last_lf_index] : chunk
+  end
+
   # Returns the headers as an array that
   # can be used for select options
   def columns_options(default=nil)
diff --git a/test/fixtures/files/mbcs-multiline-text.txt b/test/fixtures/files/mbcs-multiline-text.txt
new file mode 100644
index 000000000..f847113f2
--- /dev/null
+++ b/test/fixtures/files/mbcs-multiline-text.txt
@@ -0,0 +1,17 @@
+An emoticon is represented by 4 bytes in UTF-8 encoding.
+ +If you simply read the first 4096 bytes of this file, the trailing characters of a multi-byte sequence might be cut off, resulting in an invalid UTF-8 string. + +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 diff --git a/test/unit/issue_import_test.rb b/test/unit/issue_import_test.rb index ed913fe44..a199150d2 100644 --- a/test/unit/issue_import_test.rb +++ b/test/unit/issue_import_test.rb @@ -464,6 +464,23 @@ class IssueImportTest < ActiveSupport::TestCase end end + def test_encoding_guessing_respects_multibyte_boundaries + # Reading a specified number of bytes from the beginning of this file may + # stop in the middle of a multi-byte character, which can lead to an + # invalid UTF-8 string. + test_file = 'mbcs-multiline-text.txt' + chunk = File.read(Rails.root.join('test', 'fixtures', 'files', test_file), 4096) + chunk.force_encoding('UTF-8') # => "An emoticon is ...😃😄😅\xF0\x9F" + assert_not chunk.valid_encoding? 
+ + import = generate_import(test_file) + with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do + import.set_default_settings + guessed_encoding = import.settings['encoding'] + assert_equal 'UTF-8', guessed_encoding + end + end + def test_set_default_settings_should_detect_field_wrapper to_test = { 'import_issues.csv' => '"', -- 2.45.2