Skipping Python/Ruby 1.9 source encoding

In the same way that it makes sense to skip the shebang (#!) line in scripts, it makes sense to skip the encoding declaration in Python files (described by [PEP 263][p]) and in Ruby 1.9 files (whose syntax is similar enough that the differences are not worth worrying about).

[p]: http://www.python.org/dev/peps/pep-0263/
2010-10-21 13:45:47 +02:00 · 2010-10-21 13:45:47 +02:00 · e506c5172a
commit e506c5172a
parent a4d0e41413
2 changed files with 61 additions and 2 deletions
--- a/lib/rocco.rb
+++ b/lib/rocco.rb
@ -209,7 +209,7 @@ class Rocco
        "js"            =>  { :single => "//",  :multi => { :start => "/**", :middle => "*", :end => "*/" } },
        "lua"           =>  { :single => "--",  :multi => nil },
        "python"        =>  { :single => "#",   :multi => { :start => '"""', :middle => nil, :end => '"""' } },
-        "ruby"          =>  { :single => "#",   :multi => nil },
+        "rb"            =>  { :single => "#",   :multi => nil },
        "scheme"        =>  { :single => ";;",  :multi => nil },
      }
        
@ -227,12 +227,14 @@ class Rocco
  # Parse the raw file data into a list of two-tuples. Each tuple has the
  # form `[docs, code]` where both elements are arrays containing the
  # raw lines parsed from the input file. The first line is ignored if it
-  # is a shebang line.
+  # is a shebang line.  We also ignore the PEP 263 encoding information in
+  # python sourcefiles, and the similar ruby 1.9 syntax.
  def parse(data)
    sections = []
    docs, code = [], []
    lines = data.split("\n")
    lines.shift if lines[0] =~ /^\#\!/
+    lines.shift if lines[0] =~ /coding[:=]\s*[-\w.]+/ and [ "python", "rb" ].include? @options[:language]
    lines.each do |line|
      case line
      when @comment_pattern
--- a/test/rocco_test.rb
+++ b/test/rocco_test.rb
@ -82,6 +82,63 @@ class RoccoBasicTests < Test::Unit::TestCase

 end

+class RoccoSkippableLines < Test::Unit::TestCase
+  def test_shebang_first_line
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Shebang should be stripped when it appears as the first line."
+    )
+  end
+  def test_shebang_in_content
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1", "#!/usr/bin/env bash" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "# Comment 1\n#!/usr/bin/env bash\ndef codeblock\n# Comment 2\nend\n" ),
+        "Shebang shouldn't be stripped anywhere other than as the first line."
+    )
+  end
+  def test_encoding_in_ruby
+    r = Rocco.new( 'filename.rb' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should be stripped when they appear at the top of a python document."
+    )
+  end
+  def test_encoding_in_python
+    r = Rocco.new( 'filename.py' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should be stripped when they appear at the top of a python document."
+    )
+  end
+  def test_encoding_in_notpython
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# encoding: utf-8", "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should be stripped when they appear at the top of a python document."
+    )
+  end
+end
 class RoccoLanguageDetection < Test::Unit::TestCase
    def test_basic_detection
        r = Rocco.new( 'filename.py' ) { "" }