From e506c5172a81883b7f01a9b3de4daad222cf5e72 Mon Sep 17 00:00:00 2001
From: Mike West <mike@mikewest.org>
Date: Thu, 21 Oct 2010 13:45:47 +0200
Subject: [PATCH] Skipping Python/Ruby 1.9 source encoding

In the same way that it makes sense to skip the shebang (#!) line in
scripts, it makes sense to skip the encoding definition in Python files
(described by [PEP 263][p]) and Ruby 1.9 files (whose syntax is similar
enough that the same pattern covers both).

[p]: http://www.python.org/dev/peps/pep-0263/
---
 lib/rocco.rb       |  6 +++--
 test/rocco_test.rb | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/lib/rocco.rb b/lib/rocco.rb
index 30db0f7..0b2cbb6 100644
--- a/lib/rocco.rb
+++ b/lib/rocco.rb
@@ -209,7 +209,7 @@ class Rocco
         "js"            =>  { :single => "//",  :multi => { :start => "/**", :middle => "*", :end => "*/" } },
         "lua"           =>  { :single => "--",  :multi => nil },
         "python"        =>  { :single => "#",   :multi => { :start => '"""', :middle => nil, :end => '"""' } },
-        "ruby"          =>  { :single => "#",   :multi => nil },
+        "rb"            =>  { :single => "#",   :multi => nil },
         "scheme"        =>  { :single => ";;",  :multi => nil },
       }
         
@@ -227,12 +227,14 @@ class Rocco
   # Parse the raw file data into a list of two-tuples. Each tuple has the
   # form `[docs, code]` where both elements are arrays containing the
   # raw lines parsed from the input file. The first line is ignored if it
-  # is a shebang line.
+  # is a shebang line.  We also ignore the PEP 263 encoding declaration in
+  # Python source files, and the similar Ruby 1.9 syntax.
   def parse(data)
     sections = []
     docs, code = [], []
     lines = data.split("\n")
     lines.shift if lines[0] =~ /^\#\!/
+    lines.shift if lines[0] =~ /coding[:=]\s*[-\w.]+/ and [ "python", "rb" ].include? @options[:language]
     lines.each do |line|
       case line
       when @comment_pattern
diff --git a/test/rocco_test.rb b/test/rocco_test.rb
index 58597a8..58fcc3b 100644
--- a/test/rocco_test.rb
+++ b/test/rocco_test.rb
@@ -82,6 +82,63 @@ class RoccoBasicTests < Test::Unit::TestCase
 
 end
 
+class RoccoSkippableLines < Test::Unit::TestCase # lines `parse` is expected to drop from the top of a file
+  def test_shebang_first_line # a leading `#!` line is dropped
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Shebang should be stripped when it appears as the first line."
+    )
+  end
+  def test_shebang_in_content # a `#!` line after the first line is kept as documentation
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1", "#!/usr/bin/env bash" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "# Comment 1\n#!/usr/bin/env bash\ndef codeblock\n# Comment 2\nend\n" ),
+        "Shebang shouldn't be stripped anywhere other than as the first line."
+    )
+  end
+  def test_encoding_in_ruby # `# encoding:` following the shebang is dropped for a .rb file
+    r = Rocco.new( 'filename.rb' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should be stripped when they appear at the top of a ruby document."
+    )
+  end
+  def test_encoding_in_python # `# encoding:` following the shebang is dropped for a .py file
+    r = Rocco.new( 'filename.py' ) { "" }
+    assert_equal(
+        [
+            [ [ "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should be stripped when they appear at the top of a python document."
+    )
+  end
+  def test_encoding_in_notpython # for any other language (.sh here) the encoding line stays in the docs
+    r = Rocco.new( 'filename.sh' ) { "" }
+    assert_equal(
+        [
+            [ [ "# encoding: utf-8", "# Comment 1" ], [ "def codeblock" ] ],
+            [ [ "# Comment 2" ], [ "end" ] ]
+        ],
+        r.parse( "#!/usr/bin/env bash\n# encoding: utf-8\n# Comment 1\ndef codeblock\n# Comment 2\nend\n" ),
+        "Strings matching the PEP 263 encoding definition regex should not be stripped from documents that are neither python nor ruby."
+    )
+  end
+end
 class RoccoLanguageDetection < Test::Unit::TestCase
     def test_basic_detection
         r = Rocco.new( 'filename.py' ) { "" }