From 1b211bcc08ddff68d52a0c124bbbe5f81c8527fe Mon Sep 17 00:00:00 2001 From: Mike West Date: Tue, 19 Oct 2010 13:32:03 +0200 Subject: [PATCH] Specify encoding for Pygments This closes issue #10, in theory, but I'm not completely happy with the behavior. The output for both UTF-8 and ISO-8859-1 sources is arguably correct, but I think it'd be better to do some autodetecting of the file encoding, and explicitly convert everything to UTF-8 on input. One option is the [`chardet` gem][gem], but I'm loath to add another dependency to Rocco... [gem]: http://rubygems.org/gems/chardet/versions/0.9.0 --- lib/rocco.rb | 2 +- test/fixtures/issue10.iso-8859-1.rb | 1 + test/fixtures/issue10.rb | 1 - test/fixtures/issue10.utf-8.rb | 1 + test/rocco_test.rb | 16 +++++++++++++--- 5 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 test/fixtures/issue10.iso-8859-1.rb delete mode 100644 test/fixtures/issue10.rb create mode 100644 test/fixtures/issue10.utf-8.rb diff --git a/lib/rocco.rb b/lib/rocco.rb index 7abda79..ec62b39 100644 --- a/lib/rocco.rb +++ b/lib/rocco.rb @@ -194,7 +194,7 @@ class Rocco # then fork off a child process to write the input. def highlight_pygmentize(code) code_html = nil - open("|pygmentize -l #{@options[:language]} -f html", 'r+') do |fd| + open("|pygmentize -l #{@options[:language]} -O encoding=utf-8 -f html", 'r+') do |fd| pid = fork { fd.close_read diff --git a/test/fixtures/issue10.iso-8859-1.rb b/test/fixtures/issue10.iso-8859-1.rb new file mode 100644 index 0000000..34d1a80 --- /dev/null +++ b/test/fixtures/issue10.iso-8859-1.rb @@ -0,0 +1 @@ +# hello wörld diff --git a/test/fixtures/issue10.rb b/test/fixtures/issue10.rb deleted file mode 100644 index 93e2940..0000000 --- a/test/fixtures/issue10.rb +++ /dev/null @@ -1 +0,0 @@ -hello ąćęłńóśźż diff --git a/test/fixtures/issue10.utf-8.rb b/test/fixtures/issue10.utf-8.rb new file mode 100644 index 0000000..e87e125 --- /dev/null +++ b/test/fixtures/issue10.utf-8.rb @@ -0,0 +1 @@ +# hello ąćęłńóśźż diff --git a/test/rocco_test.rb b/test/rocco_test.rb index ff86ed8..58c0820 100644 --- a/test/rocco_test.rb +++ b/test/rocco_test.rb @@ -101,10 +101,20 @@ class RoccoIssueTests < Test::Unit::TestCase def test_issue10_utf8_processing # Rocco has issues with strange UTF-8 characters: need to explicitly set the encoding for Pygments # http://github.com/rtomayko/rocco/issues#issue/10 - r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.rb" ) + r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.utf-8.rb" ) assert_equal( - "

hello ąćęłńóśźż

\n", - r.sections[0][0] + "

hello ąćęłńóśźż

\n", + r.sections[0][0], + "UTF-8 input files ought behave correctly." + ) + # and, just for grins, ensure that iso-8859-1 works too. + # @TODO: Is this really the correct behavior? Converting text + # to UTF-8 on the way out is probably preferable. + r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.iso-8859-1.rb" ) + assert_equal( + "

hello w\366rld

\n", + r.sections[0][0], + "ISO-8859-1 input should probably also behave correctly." ) end def test_issue12_css_octothorpe_classname_change