rocco/lib/rocco.rb

# **Rocco** is a Ruby port of [Docco][do], the quick-and-dirty,
# hundred-line-long, literate-programming-style documentation generator.
#
# Rocco reads Ruby source files and produces annotated source documentation
# in HTML format. Comments are formatted with [Markdown][md] and presented
# alongside syntax highlighted code so as to give an annotation effect.
# This page is the result of running Rocco against [its own source file][so].
#
# Most of this was written while waiting for [node.js][no] to build (so I
# could use Docco!). Docco's gorgeous HTML and CSS are taken verbatim.
# The main difference is that Rocco is written in Ruby instead of
# [CoffeeScript][co] and may be a bit easier to obtain and install in
# existing Ruby environments or where node doesn't run yet.
#
# Install Rocco with Rubygems:
#
#     gem install rocco
#
# Once installed, the `rocco` command can be used to generate documentation
# for a set of Ruby source files:
#
#     rocco lib/*.rb
#
# The HTML files are written to the current working directory.
#
# [no]: http://nodejs.org/
# [do]: http://jashkenas.github.com/docco/
# [co]: http://coffeescript.org/
# [md]: http://daringfireball.net/projects/markdown/
# [so]: http://github.com/rtomayko/rocco/blob/master/lib/rocco.rb#commit

#### Prerequisites

# We'll need a Markdown library. [RDiscount][rd], if we're lucky. Otherwise,
# issue a warning and fall back on using BlueCloth.
#
# [rd]: http://github.com/rtomayko/rdiscount
begin
  require 'rdiscount'
rescue LoadError => boom
  warn "WARNING: #{boom}. Trying bluecloth."
  require 'bluecloth'
  Markdown = BlueCloth
end

# We use [{{ mustache }}](http://defunkt.github.com/mustache/) for
# HTML templating.
require 'mustache'

# We use `Net::HTTP` to highlight code via <http://pygments.appspot.com>
require 'net/http'

# Code is run through [Pygments](http://pygments.org/) for syntax
# highlighting. If it's not installed locally, use a webservice.
# Look in every `PATH` entry for an executable `pygmentize`; warn at load
# time when none of them has one.
search_path = ENV['PATH'].split(':')
unless search_path.any? { |dir| File.executable?("#{dir}/pygmentize") }
  warn "WARNING: Pygments not found. Using webservice."
end

#### Public Interface

# `Rocco.new` takes a source `filename`, an optional list of source filenames
# for other documentation sources, an `options` hash, and an optional `block`.
# The `options` hash respects four members:
#
# * `:language`: specifies which Pygments lexer to use if one can't be
#   auto-detected from the filename.  _Defaults to `ruby`_.
#
# * `:comment_chars`, which specifies the comment characters of the
#   target language. _Defaults to `#`_.
#
# * `:template_file`, which specifies an external template file to use
#   when rendering the final, highlighted file via Mustache.  _Defaults
#   to `nil` (that is, Mustache will use `./lib/rocco/layout.mustache`)_.
#
# * `:docblocks`, which, when set, converts Docblock `@annotations` in
#   comments to Markdown before they are rendered.  _Defaults to unset_.
#
class Rocco
  # Version string of this Rocco release.
  VERSION = '0.7'

  # Build the documentation sections for `filename`.
  #
  # * `filename` - path of the source file; also handed to `pygmentize`
  #   for language detection.
  # * `sources`  - all source filenames in the documentation set.
  # * `options`  - overrides merged on top of the defaults below.
  # * `block`    - optional; when given, its return value is used as the
  #   file contents instead of reading `filename` from disk.
  def initialize(filename, sources=[], options={}, &block)
    @file       = filename
    @sources    = sources

    # When `block` is given, it must read the contents of the file using
    # whatever means necessary and return it as a string. With no `block`,
    # the file is read to retrieve data.
    @data =
      if block_given?
        yield
      else
        File.read(filename)
      end

    defaults = {
      :language      => 'ruby',
      :comment_chars => '#',
      :template_file => nil
    }
    @options = defaults.merge(options)

    # If we detect a language
    if detect_language() != "text"
      # then assign the detected language to `:language`, and look for
      # comment characters based on that language
      @options[:language] = detect_language()
      @options[:comment_chars] = generate_comment_chars()

    # If we didn't detect a language, but the user provided one, use it
    # to look around for comment characters to override the default.
    elsif @options[:language] != defaults[:language]
      @options[:comment_chars] = generate_comment_chars()

    # If neither is true, then convert the default comment character string
    # into the comment_char syntax (we'll discuss that syntax in detail when
    # we get to `generate_comment_chars()` in a moment).  The hash carries
    # the same three keys that `generate_comment_chars()` produces.
    else
      @options[:comment_chars] = {
        :single  => @options[:comment_chars],
        :multi   => nil,
        :heredoc => nil
      }
    end

    # Turn `:comment_chars` into a regex matching a series of spaces, the
    # `:comment_chars` string, and then an optional space.  We'll use that
    # to detect single-line comments.
    #
    # The marker is escaped so that regex metacharacters in it are matched
    # literally, and the trailing class is written `\\s?` because a bare
    # `\s` inside a double-quoted string is just a space character.
    # `to_s` keeps this working for languages without a single-line marker.
    single = @options[:comment_chars][:single]
    @comment_pattern =
      Regexp.new("^\\s*#{Regexp.escape(single.to_s)}\\s?")

    # `parse()` the file contents stored in `@data`.  Run the result through
    # `split()` and that result through `highlight()` to generate the final
    # section list.
    @sections = highlight(split(parse(@data)))
  end

  # The filename as given to `Rocco.new`.
  attr_reader :file

  # The merged options hash
  attr_reader :options

  # A list of two-tuples representing each *section* of the source file. Each
  # item in the list has the form: `[docs_html, code_html]`, where both
  # elements are strings containing the documentation and source code HTML,
  # respectively.
  attr_reader :sections

  # A list of all source filenames included in the documentation set. Useful
  # for building an index of other files.
  attr_reader :sources

  # Generate HTML output for the entire document.
  require 'rocco/layout'
  def to_html
    # Hand this document and the (possibly `nil`) custom template file to
    # the layout, then return whatever the layout renders.
    template = @options[:template_file]
    layout   = Rocco::Layout.new(self, template)
    layout.render
  end

  # Helper Functions
  # ----------------

  # Returns `true` if `pygmentize` is available locally, `false` otherwise.
  # The answer is computed once per instance and cached, including a
  # negative answer.
  def pygmentize?
    # `defined?` rather than `||=`: with `||=` a `false` result is never
    # remembered, so `PATH` would be re-scanned on every call.
    return @_pygmentize if defined?(@_pygmentize)

    # `to_s` guards against an unset `PATH`; `File::PATH_SEPARATOR` is the
    # platform's entry delimiter (`:` on Unix-like systems).
    @_pygmentize = ENV['PATH'].to_s.split(File::PATH_SEPARATOR).
      any? { |dir| File.executable?("#{dir}/pygmentize") }
  end

  # If `pygmentize` is available, we can use it to autodetect a file's
  # language based on its filename.  Filenames without extensions, or with
  # extensions that `pygmentize` doesn't understand will return `text`.
  # We'll also return `text` if `pygmentize` isn't available.
  #
  # We'll memoize the result, as we'll call this a few times.
  def detect_language
    @_language ||=
      if pygmentize?
        # Run `pygmentize -N <file>` with the arguments given as an array so
        # that no shell is involved: a filename containing spaces or shell
        # metacharacters reaches `pygmentize` as a single, literal argument.
        # Only the part of the reported name before the first `+` is kept.
        IO.popen(['pygmentize', '-N', @file.to_s]) { |io| io.read }.
          strip.split('+').first
      else
        "text"
      end
  end

  # Given a file's language, we should be able to autopopulate the
  # `comment_chars` variables for single-line comments.  If we don't
  # have comment characters on record for a given language, we'll
  # use the user-provided `:comment_chars` option (which defaults to
  # `#`).
  #
  # Comment characters are listed as:
  #
  #     { :single  => "//",
  #       :multi   => { :start => "/**", :middle => "*", :end => "*/" },
  #       :heredoc => nil }
  #
  # `:single` denotes the leading character of a single-line comment.
  # `:multi` holds the block comment markers: `:start` denotes the string
  # that should appear alone on a line of code to begin a block of
  # documentation, `:middle` denotes the leading character of block
  # comment content, and `:end` is the string that ought to appear alone
  # on a line to close a block of documentation.  `:heredoc` is the string
  # that introduces a heredoc, whose contents are treated as code.  That is:
  #
  #     /**                 [:multi][:start]
  #      *                  [:multi][:middle]
  #      ...
  #      *                  [:multi][:middle]
  #      */                 [:multi][:end]
  #
  # If a language only has one type of comment, the missing type
  # should be assigned `nil`.
  #
  # At the moment, we're only returning `:single`.  Consider this
  # groundwork for block comment parsing.
  # Markers shared by the languages below that use `//` line comments and
  # `/** ... */` documentation blocks.
  C_STYLE_COMMENTS = {
    :single  => "//",
    :multi   => { :start => "/**", :middle => "*", :end => "*/" },
    :heredoc => nil
  }

  # Comment markers per language.  Looked up with the value of
  # `@options[:language]`.
  COMMENT_STYLES = {
    "bash"          => { :single => "#", :multi => nil },
    "c"             => C_STYLE_COMMENTS,
    "coffee-script" => {
      :single  => "#",
      :multi   => { :start => "###", :middle => nil, :end => "###" },
      :heredoc => nil
    },
    "cpp"           => C_STYLE_COMMENTS,
    "csharp"        => C_STYLE_COMMENTS,
    "css"           => {
      :single  => nil,
      :multi   => { :start => "/**", :middle => "*", :end => "*/" },
      :heredoc => nil
    },
    "html"          => {
      :single  => nil,
      :multi   => { :start => '<!--', :middle => nil, :end => '-->' },
      :heredoc => nil
    },
    "java"          => C_STYLE_COMMENTS,
    "js"            => C_STYLE_COMMENTS,
    "lua"           => { :single => "--", :multi => nil, :heredoc => nil },
    "php"           => C_STYLE_COMMENTS,
    "python"        => {
      :single  => "#",
      :multi   => { :start => '"""', :middle => nil, :end => '"""' },
      :heredoc => nil
    },
    "rb"            => {
      :single  => "#",
      :multi   => { :start => '=begin', :middle => nil, :end => '=end' },
      :heredoc => "<<-"
    },
    "scala"         => C_STYLE_COMMENTS,
    "scheme"        => { :single => ";;", :multi => nil, :heredoc => nil },
    "xml"           => {
      :single  => nil,
      :multi   => { :start => '<!--', :middle => nil, :end => '-->' },
      :heredoc => nil
    }
  }

  def generate_comment_chars
    # Prefer the table entry for the current language; otherwise wrap the
    # `:comment_chars` option as the single-line marker.  The first result
    # is cached for the lifetime of the instance.
    @_commentchar ||= COMMENT_STYLES[@options[:language]] || {
      :single  => @options[:comment_chars],
      :multi   => nil,
      :heredoc => nil
    }
  end

  # Internal Parsing and Highlighting
  # ---------------------------------

  # Parse the raw file data into a list of two-tuples. Each tuple has the
  # form `[docs, code]` where both elements are arrays containing the
  # raw lines parsed from the input file, comment characters stripped.
  def parse(data)
    sections = []
    docs, code = [], []
    lines = data.split("\n")

    # The first line is ignored if it is a shebang line.  We also ignore the
    # PEP 263 encoding information in python sourcefiles, and the similar ruby
    # 1.9 syntax.
    lines.shift if lines[0] =~ /^\#\!/
    lines.shift if lines[0] =~ /coding[:=]\s*[-\w.]+/ &&
                   [ "python", "rb" ].include?(@options[:language])

    # To detect both block comments and single-line comments, we'll set
    # up a tiny state machine, and loop through each line of the file.
    # This requires an `in_comment_block` boolean, and a few regular
    # expressions for line tests.  We'll do the same for fake heredoc parsing.
    #
    # `in_heredoc` is `false` outside a heredoc and holds the regex matching
    # the terminator line while inside one.  Every line-test regex stays
    # `nil` when the language has no marker of that kind, which is why each
    # test below is guarded by a check on the regex itself.
    in_comment_block = false
    in_heredoc = false
    single_line_comment, block_comment_start, block_comment_mid, block_comment_end =
      nil, nil, nil, nil
    block_comment_one_liner, block_comment_start_with, block_comment_end_with =
      nil, nil, nil
    heredoc_start = nil

    # Optional indentation, the marker, and at most one following space.
    if not @options[:comment_chars][:single].nil?
      single_line_comment = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:single])}\\s?")
    end

    # `block_comment_start` / `block_comment_end` match a marker alone on its
    # line, `block_comment_one_liner` a whole block on one line, and the
    # `_with` variants a marker sharing its line with comment text.
    if not @options[:comment_chars][:multi].nil?
      block_comment_start = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:multi][:start])}\\s*$")
      block_comment_end   = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:multi][:end])}\\s*$")
      block_comment_one_liner = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:multi][:start])}\\s*(.*?)\\s*#{Regexp.escape(@options[:comment_chars][:multi][:end])}\\s*$")
      block_comment_start_with = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:multi][:start])}\\s*(.*?)$")
      block_comment_end_with = Regexp.new("\\s*(.*?)\\s*#{Regexp.escape(@options[:comment_chars][:multi][:end])}\\s*$")
      if @options[:comment_chars][:multi][:middle]
        block_comment_mid = Regexp.new("^\\s*#{Regexp.escape(@options[:comment_chars][:multi][:middle])}\\s?")
      end
    end

    # A heredoc opener is the marker followed by a run of non-space
    # characters reaching the end of the line; that run is captured verbatim
    # (quotes around an identifier are not unwrapped).
    if not @options[:comment_chars][:heredoc].nil?
      heredoc_start = Regexp.new("#{Regexp.escape(@options[:comment_chars][:heredoc])}(\\S+)$")
    end

    lines.each do |line|
      # If we're currently in a comment block, check whether the line matches
      # the _end_ of a comment block or the _end_ of a comment block with a
      # comment.
      if in_comment_block
        if block_comment_end && line.match(block_comment_end)
          in_comment_block = false
        elsif block_comment_end_with && line.match(block_comment_end_with)
          in_comment_block = false
          docs << line.match(block_comment_end_with).captures.first.
                        sub(block_comment_mid || '', '')
        else
          docs << line.sub(block_comment_mid || '', '')
        end
      # If we're currently in a heredoc, we're looking for the end of the
      # heredoc, and everything it contains is code.  The terminator line
      # itself is code too.
      elsif in_heredoc
        in_heredoc = false if line.match(in_heredoc)
        code << line
      # Otherwise, check whether the line starts a heredoc. If so, note the end
      # pattern, and the line is code.  Otherwise check whether the line matches
      # the beginning of a block, or a single-line comment all on its lonesome.
      # In either case, if there's code, start a new section.
      else
        if heredoc_start && line.match(heredoc_start)
          # The only heredoc marker on record is `<<-`, whose terminator may
          # be indented, so leading whitespace is allowed before it.  Without
          # that, an indented terminator would never close the heredoc and
          # the rest of the file would be swallowed as code.
          in_heredoc = Regexp.new("^\\s*#{Regexp.escape(Regexp.last_match(1))}$")
          code << line
        elsif block_comment_one_liner && line.match(block_comment_one_liner)
          if code.any?
            sections << [docs, code]
            docs, code = [], []
          end
          docs << line.match(block_comment_one_liner).captures.first
        elsif block_comment_start && line.match(block_comment_start)
          in_comment_block = true
          if code.any?
            sections << [docs, code]
            docs, code = [], []
          end
        elsif block_comment_start_with && line.match(block_comment_start_with)
          in_comment_block = true
          if code.any?
            sections << [docs, code]
            docs, code = [], []
          end
          docs << line.match(block_comment_start_with).captures.first
        elsif single_line_comment && line.match(single_line_comment)
          if code.any?
            sections << [docs, code]
            docs, code = [], []
          end
          docs << line.sub(single_line_comment || '', '')
        else
          code << line
        end
      end
    end

    # Flush whatever is left, then strip the shared indentation from the docs.
    sections << [docs, code] if docs.any? || code.any?
    normalize_leading_spaces(sections)
  end

  # Normalizes documentation whitespace by checking for leading whitespace,
  # removing it, and then removing the same amount of whitespace from each
  # succeeding line.  That is:
  #
  #     def func():
  #       """
  #         Comment 1
  #         Comment 2
  #       """
  #       print "omg!"
  #
  # should yield a comment block of `Comment 1\nComment 2` and code of
  # `def func():\n  print "omg!"`
  # `sections` is a list of `[docs, code]` pairs.  Each pair's docs array is
  # replaced in place; the code array is left untouched.  Returns the list
  # of (same) pairs.
  def normalize_leading_spaces(sections)
    sections.map do |section|
      docs = section[0]
      if docs && docs.any?
        # The indentation of the first docs line is the amount to strip.
        # `\s` is written inside a regex literal here: inside a
        # double-quoted string it would collapse to a plain space and tab
        # indentation would never be recognised.
        indent = docs[0][/\A\s+/]
        if indent
          # Escape the indent so it is matched literally.
          strip_indent = /\A#{Regexp.escape(indent)}/
          section[0] = docs.map { |line| line.sub(strip_indent, '') }
        end
      end
      section
    end
  end

  # Take the list of paired *sections* two-tuples and split into two
  # separate lists: one holding the comments with leaders removed and
  # one with the code blocks.
  def split(sections)
    # Docs: one newline-joined string per section.
    docs_blocks = sections.map { |docs, _code| docs.join("\n") }

    # Code: same, but each run of leading tabs becomes two spaces per tab.
    code_blocks = sections.map do |_docs, code|
      expanded = code.map do |line|
        line.sub(/^\t+/) { |tabs| '  ' * tabs.length }
      end
      expanded.join("\n")
    end

    [docs_blocks, code_blocks]
  end

  # Take a list of block comments and convert Docblock @annotations to
  # Markdown syntax.
  def docblock(docs)
    docs.map do |doc|
      rewritten = doc.split("\n").map do |line|
        # Lines that don't open with an `@tag` pass through untouched.
        next line unless line =~ /^@\w+/

        # `@tag rest` becomes a blockquoted bold tag; the two trailing
        # spaces are appended to every tagged line.
        line.sub(/^@(\w+)\s+/, '> **\1** ') + "  "
      end
      rewritten.join("\n")
    end
  end

  # Take the result of `split` and apply Markdown formatting to comments and
  # syntax highlighting to source code.
  # NOTE(review): this method uses `CGI.escapeHTML`, but this file does not
  # require `cgi` itself; presumably another dependency loads it. Confirm.
  def highlight(blocks)
    docs_blocks, code_blocks = blocks

    # Pre-process Docblock @annotations.
    docs_blocks = docblock(docs_blocks) if @options[:docblocks]

    # Render all docs as one Markdown document with a divider heading
    # between sections, then cut the HTML apart at those headings.
    docs_html = process_markdown(docs_blocks.join("\n\n##### DIVIDER\n\n")).
      split(/\n*<h5>DIVIDER<\/h5>\n*/m)

    # Regex source for a highlighted comment token wrapping `inner`, and for
    # a comment marker as it appears in the HTML output.
    comment_span = lambda { |inner| '<span class="c.?">' + inner + '</span>' }
    escaped      = lambda { |marker| Regexp.escape(CGI.escapeHTML(marker)) }

    # The code gets the same treatment: a divider comment is planted between
    # sections, written with whichever comment syntax the language has, and
    # a regex is built that recognises its highlighted form.
    chars = @options[:comment_chars]
    if chars[:single]
      opener = chars[:single]
      divider_input  = "\n\n#{opener} DIVIDER\n\n"
      divider_source = "\\n*" +
        comment_span.call(escaped.call(opener) + ' DIVIDER') +
        "\\n*"
    else
      opener = chars[:multi][:start]
      closer = chars[:multi][:end]
      divider_input  = "\n\n#{opener}\nDIVIDER\n#{closer}\n\n"
      divider_source = "\\n*" + [
        comment_span.call(escaped.call(opener)),
        comment_span.call("DIVIDER"),
        comment_span.call(escaped.call(closer))
      ].join("\\n") + "\\n*"
    end
    divider_output = Regexp.new(divider_source, Regexp::MULTILINE)

    # Highlight the whole stream in one go, locally when possible.
    code_stream = code_blocks.join(divider_input)
    raw_html = pygmentize? ? highlight_pygmentize(code_stream) : highlight_webservice(code_stream)

    # Split back into sections and drop the partial `<pre>` wrappers.
    code_html = raw_html.split(divider_output).map do |fragment|
      fragment.sub(/\n?<div class="highlight"><pre>/m, '').
               sub(/\n?<\/pre><\/div>\n/m, '')
    end

    # Lastly, pair each docs fragment with its code fragment.
    docs_html.zip(code_html)
  end

  # Convert Markdown to classy HTML.
  def process_markdown(text)
    # `Markdown` is whichever processor was loaded at the top of the file.
    renderer = Markdown.new(text, :smart)
    renderer.to_html
  end

  # We `popen` a read/write pygmentize process in the parent and
  # then fork off a child process to write the input.
  # `code` is the source text to highlight; the HTML that `pygmentize`
  # prints is returned as a string.
  def highlight_pygmentize(code)
    code_html = nil

    # The command is given as an argument array, so it is executed directly
    # rather than through a shell and the language option is always passed
    # as one literal argument.
    command = ['pygmentize', '-l', @options[:language].to_s,
               '-O', 'encoding=utf-8', '-f', 'html']

    IO.popen(command, 'r+') do |fd|
      # The child writes the input while the parent reads the output.
      pid =
        fork {
          fd.close_read
          fd.write code
          fd.close_write
          exit!
        }
      # The parent only reads; closing its write end lets `pygmentize` see
      # end-of-input once the child has finished writing.
      fd.close_write
      code_html = fd.read
      fd.close_read
      Process.wait(pid)
    end

    code_html
  end

  # Pygments is not one of those things that's trivial for a ruby user to install,
  # so we'll fall back on a webservice to highlight the code if it isn't available.
  def highlight_webservice(code)
    # POST the language and the code as form fields; the response body is
    # returned as-is.
    endpoint = URI.parse('http://pygments.appspot.com/')
    form     = { 'lang' => @options[:language], 'code' => code }
    Net::HTTP.post_form(endpoint, form).body
  end
end

# And that's it.