Specify encoding for Pygments

This closes issue #10, in theory, but I'm not completely happy with the
behavior.  The output for both UTF-8 and ISO-8859-1 sources is arguably
correct, but I think it'd be better to autodetect the file encoding and
explicitly convert everything to UTF-8 on input.  One
option is the [`chardet` gem][gem], but I'm loath to add another
dependency to Rocco...

[gem]: http://rubygems.org/gems/chardet/versions/0.9.0
This commit is contained in:
Mike West 2010-10-19 13:32:03 +02:00
parent 38683a8cc2
commit 1b211bcc08
5 changed files with 16 additions and 5 deletions

View File

@ -194,7 +194,7 @@ class Rocco
# then fork off a child process to write the input. # then fork off a child process to write the input.
def highlight_pygmentize(code) def highlight_pygmentize(code)
code_html = nil code_html = nil
open("|pygmentize -l #{@options[:language]} -f html", 'r+') do |fd| open("|pygmentize -l #{@options[:language]} -O encoding=utf-8 -f html", 'r+') do |fd|
pid = pid =
fork { fork {
fd.close_read fd.close_read

1
test/fixtures/issue10.iso-8859-1.rb vendored Normal file
View File

@ -0,0 +1 @@
# hello wörld

View File

@ -1 +0,0 @@
hello ąćęłńóśźż

1
test/fixtures/issue10.utf-8.rb vendored Normal file
View File

@ -0,0 +1 @@
# hello ąćęłńóśźż

View File

@ -101,10 +101,20 @@ class RoccoIssueTests < Test::Unit::TestCase
def test_issue10_utf8_processing def test_issue10_utf8_processing
# Rocco has issues with strange UTF-8 characters: need to explicitly set the encoding for Pygments # Rocco has issues with strange UTF-8 characters: need to explicitly set the encoding for Pygments
# http://github.com/rtomayko/rocco/issues#issue/10 # http://github.com/rtomayko/rocco/issues#issue/10
r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.rb" ) r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.utf-8.rb" )
assert_equal( assert_equal(
"<p>hello ąćęłńóśźż</p>\n", "<p>hello ąćęłńóśźż</p>\n",
r.sections[0][0] r.sections[0][0],
"UTF-8 input files ought behave correctly."
)
# and, just for grins, ensure that iso-8859-1 works too.
# @TODO: Is this really the correct behavior? Converting text
# to UTF-8 on the way out is probably preferable.
r = Rocco.new( File.dirname(__FILE__) + "/fixtures/issue10.iso-8859-1.rb" )
assert_equal(
"<p>hello w\366rld</p>\n",
r.sections[0][0],
"ISO-8859-1 input should probably also behave correctly."
) )
end end
def test_issue12_css_octothorpe_classname_change def test_issue12_css_octothorpe_classname_change