[Vimoutliner] Parsing outlines with awk?
Ned Konz
metamagix at gmail.com
Sat Mar 31 12:12:08 EST 2007
Steve Litt wrote:
> Has anyone used awk to parse outlines? How did it work for you? I just learned
> awk, and implemented stacks in awk (I don't think awk has stacks natively,
> but it's trivial to make). Theoretically, armed with stacks one could parse
> and translate outlines, although the lack of complex variables (C structures,
> Ruby classes, etc) makes it a little harder.
Hi Steve,
Sorry, I haven't used Awk for parsing outlines for a while. I looked
around for my last version of that and couldn't find it. As I recall,
though, I used a state machine approach (Awk's pattern/block structure
maps well to FSMs).
Attached is my Ruby parser for my own (TVO) outline structure. You'd
need to change some of the regular expressions for your text additions,
etc. but it may be helpful.
It actually contains the parser class as well as a few formatters.
--
Ned Konz
ned at bike-nomad.com
http://bike-nomad.com
-------------- next part --------------
#!/usr/bin/env ruby
# Module for parsing/formatting TVO outlines.
# $Id: otlParser.rb 126 2006-12-07 21:30:27Z ned $
require 'stringio'
require 'getoptlong'
module TVO
RETodo1 = /\b(TODO|XXX|NOTE)\b/
REStandout = /\*\*\s*\b(.+?)\b\s*\*\*/
RETagDef = /<id=([^>]+)>|\[\[([^\[\]]+)\]\]/i
REExternTagRef = /<url:\s*([^>]+)\s*>|\[([a-z]+:[^\]]+)\]/i
RETagRef = /\[([^\[\]:]+)\]/
REVimTagRef = /\[(:[^\[\]:]+)\]/
REHTMLOnly = /(.*?)\s*<html:\s*([^>]+|<[^>]+>)\s*>\s*(.*)/i
REItalic = /I<(.+?)>/
REBold = /B<(.+?)>/
RECode = /C<(.+?)>/
REUnderline = /U<(.+?)>/
RETextLeader = /^\t*\|\s*/
REText = /^\t*\|\s*(.*)/
# outlineItem := head text? outlineItem*
#
# from Vim syntax definition:
# text
# contains=vikiHyperLinks,RETodo,RETagDef,RETagRef,RETextLeader nextgroup=REText
# RETabs : /^\t\{0-9}[^\t|].*/
# contains=vikiHyperLinks,RETodo,RETagDef,RETagRef nextgroup=RETabs,REText
# vikiHyperLinks = vikiLink,vikiExtendedLink,vikiURL,vikiInexistentLink
#
class Item
public
attr_accessor :parent, :keepHead
attr_reader :head, :children, :level
def initialize(level, head='', text=nil, parent=nil, children=[])
@level = level
@head = head
@text = text ? text.to_a.join("\n").split("\n") : []
@children = children
@parent = parent
@keepHead = false
self
end
def head=(headText)
if headText.nil?
@head = headText
elsif headText[0..0] == '+'
@keepHead = true
@head = headText[1..-1]
else
@head = headText
end
end
def addText(text)
@text.push(text)
end
def addChild(child)
@children.push(child)
child.parent = self
end
def children=(_children)
_children.each { |c| addChild(c) }
end
def text
@text
end
def text=(_text)
@text = _text.split("\n")
end
def each_text_line(&blk)
text.each(&blk)
end
# returns array of arrays [marker, para]
# marker is '' or '-' or '*'
# para is array of paragraph lines
def textParagraphs
paras = []
thisPara = []
markerLength = 0
marker = ''
text.each do |textline|
case textline
when /^(\s*([-*])\s*)(.*)/
paras.push([ marker, thisPara ])
marker = $2
markerLength = $1.length
thisPara = [ $3 ]
when /^(\s*)(.+)/
if $1.length == markerLength
thisPara.push($2)
else
paras.push([ marker, thisPara ])
markerLength = $1.length
thisPara = [ $2 ]
marker = ''
end
when /^\s*$/
paras.push([ marker, thisPara ])
thisPara = []
marker = ''
end
end
paras.push([ marker, thisPara ])
return paras.reject { |p| p[1].length == 0 }
end
# calls given block with:
# array of related lines
# marker ('' or '-' or '*')
def relatedTextParagraphsDo
lastMarker = ''
related = []
paras = textParagraphs
paras.push([nil, []]) # to flush last one
paras.each do |p|
marker = p[0]
textLines = p[1]
if marker == lastMarker
related.push(textLines)
else
# process related paragraphs if any
if related.length > 0
yield related, lastMarker
end
lastMarker = marker
related = [ textLines ]
end
end
end
end
class Formatter
protected
# default output is just flattened.
def printHead?(item)
return (!@textOnly || item.keepHead)
end
def visitHead(item,seq=0)
return unless printHead?(item)
file().puts(embellish(item.head), "")
end
def visitText(item,seq=0)
item.text.each { |textLine| file.puts(embellish(textLine)) }
file.puts("") if item.text.length > 0
end
def visitItem(item,seq=0)
if item.level >= 0
visitHead(item,seq)
visitText(item,seq)
end
item.children.each_with_index { |ch,n| visitItem(ch,n) }
nil
end
# format individual spans
def italic(text) ; text; end
def bold(text) ; text; end
def code(text) ; text; end
def underline(text) ; text; end
def standout(text) ; text; end
def tagDef(text) ; text; end
def tagRef(text) ; text; end
def vimTagRef(text) ; text; end
def htmlOnly(text); end
def embellish(text)
text.
gsub(REItalic) { |s| italic($1) }.
gsub(REBold) { |s| bold($1) }.
gsub(RECode) { |s| code($1) }.
gsub(REUnderline) { |s| underline($1) }.
gsub(REStandout) { |s| standout($1) }.
gsub(RETagDef) { |s| tagDef($1||$2) }.
gsub(RETagRef) { |s| tagRef($1) }.
gsub(REVimTagRef) { |s| vimTagRef($1) }.
gsub(REHTMLOnly) { |s| htmlOnly($1) }
end
public
def self.formatterNames
TVO.constants.
select { |c|
cl = TVO.const_get(c) rescue ''
cl.kind_of?(Class) && cl <= self
}.collect { |cn| cn.sub(/Formatter$/, '') }.
sort
end
attr_accessor :file, :textOnly
def initialize(_file=$stdout)
@file = _file
@textOnly = false
end
def format(outlineRoot)
visitItem(outlineRoot)
end
end
# Output TVO again (for building OTL files programmatically)
class OutlineFormatter < Formatter
protected
Prefixes = (0..9).to_a.collect { |n| ("\t" * n) }
def prefixForLevel(level)
Prefixes[level] || ((level < 0) ? "" : ("\t" * level))
end
def visitHead(item,seq=0)
return unless printHead?(item)
file.print(prefixForLevel(item.level), item.head, "\n")
end
def visitText(item,seq=0)
prefix = prefixForLevel(item.level) + '| '
item.text.each { |tline| file.print(prefix, tline, "\n") }
end
end
# Format outline as h1-h6/ul
# Classes used are:
# <a href="">
# otlExternTagRef
# otlTagRef
# <a name="">
# otlTagDef
# <span>
# otlHTMLOnly
# otlTodo
# otlStandout
# otlVimTagRef
# otlUnderline
# <hr>
# h1 .. h<#>
# <h1> .. <h5>
# h1 .. h6
# <h6>
# h6 .. h<#>
# <ul>,<li>
# t<#>pd (if marker was '-')
# t<#>pa (if marker was '*')
# <div>,<p>
# t<#>p
#
#
class HTMLFormatter < Formatter
def self.quoted(text)
text.gsub(/&/, '&'). gsub(/</, '<'). gsub(/>/, '>')
end
def self.requoted(re)
Regexp.new(re.source.gsub(/\\\\/, '\\'). gsub(/</, '<'). gsub(/>/, '>'))
end
RETagDef = /<id=([^&]+)>|\[\[([^\[\]]+)\]\]/i
REExternTagRef = /<url:\s*([^>]+)\s*>|\[([a-z]+:[^\]]+)\]/i
RETagRef = /\[([^\[\]:&]+)\]/
REVimTagRef = /\[(:[^\[\]:&]+)\]/
REHTMLOnly = /(.*?)\s*<html:\s*(.+?|.*<.+?>)\s*>\s*(.*)/i
REItalic = /I<(.+?)>/
REBold = /B<(.+?)>/
RECode = /C<(.+?)>/
REUnderline = /U<(.+?)>/
REGtLt = /&&([gl]t;)/
protected
# notice in-text markings
# Would be run after quoted
def decorated(textLine)
if textLine.match(REHTMLOnly)
return textLine.
gsub(REHTMLOnly) do |s|
"#{decorated($1)} <span class=\"otlHTMLOnly\">#{$2}</span> #{decorated($3)}"
end
else
return textLine.
gsub(REGtLt) {|s| "&#{$1}" }.
gsub(REItalic) {|s| "<i>#{$1}</i>" }.
gsub(REBold) {|s| "<strong>#{$1}</strong>" }.
gsub(RECode) {|s| "<tt>#{$1}</tt>" }.
gsub(REUnderline) {|s| "<span class=\"otlUnderline\">#{$1}</span>" }.
gsub(RETodo1) {|s| "<span class=\"otlTodo\">#{s}</span>" }.
gsub(REStandout) {|s| "<span class=\"otlStandout\">#{$1}</span>" }.
gsub(RETagDef) {|s| "<a class=\"otlTagDef\" name=\"#{urlEncoded($1||$2)}\"></a>" }.
gsub(REExternTagRef){|s|
"<a class=\"otlExternTagRef\" href=\"#{urlEncoded($1||$2)}\">#{$1||$2}</a>" }.
gsub(RETagRef) do |s|
url=dest=$1
if dest =~ /^--\s*(.+)\s*--$/
url = dest = $1
end
if dest =~ /^([^#]+)#([^#]+)$/
url=dest
dest=$1
end
if File.readable?(dest)
"<a class=\"otlExternTagRef\" href=\"#{urlEncoded(url)}\">#{url}</a>"
else
"<a class=\"otlTagRef\" href=\"##{urlEncoded(url)}\">#{url}</a>"
end
end.
gsub(REVimTagRef) { "<span class=\"otlVimTagRef\">#{$&}</span>" }
end
end
def quoted(text)
self.class.quoted(text).gsub(/\n/, "\n" + (" " * @nest))
end
def urlEncoded(text)
text.gsub(/[^#.A-Za-z0-9]/) { |c| sprintf("%%%02X", c[0]) }
end
def htmlTag(tagname, attribs={})
file.print("\n", " " * @nest)
file.print('<', tagname)
attribs.each_pair { |k,v| file.print(" #{k}=\"#{quoted(v)}\"") }
if block_given?
file.print('>')
@nest += 1
text = yield
@nest -= 1
file.print(decorated(quoted(text))) if text
file.print("\n", '</', tagname, '>')
else
file.print(' />')
end
nil
end
def tagAndClassForHead(itemLevel)
hLevel = "h#{itemLevel}"
tag = (itemLevel.between?(1,6) ? hLevel : 'h6')
return *[tag, hLevel]
end
def visitHead(item,seq=0)
return unless printHead?(item)
itemLevel = item.level + 1
(tag, hLevel) = tagAndClassForHead(itemLevel)
if itemLevel == 1 && seq > 0
htmlTag('hr', { :class => hLevel })
end
htmlTag(tag, { :class => hLevel } ) { item.head }
end
def tagsAndClassForTextPara(itemLevel,marker)
case marker
when '-'
return *['ul','li',"t#{itemLevel}pd"]
when '*'
return *['ul','li',"t#{itemLevel}pa"]
else
return *['div','p',"t#{itemLevel}p"]
end
end
def formatTextParagraph(para, itemTag, itemClass)
htmlTag(itemTag, {:class => itemClass }) { para.join("\n") }
end
def visitText(item,seq=0)
item.relatedTextParagraphsDo do |related, marker|
(groupTag,itemTag,itemClass) = tagsAndClassForTextPara(item.level,marker)
htmlTag(groupTag, {:class => itemClass }) do
related.each { |p| formatTextParagraph(p, itemTag, itemClass) }
nil
end
end
end
public
attr_accessor :stylesheet
def self.defaultStylesheet
"tvo.css"
end
def initialize(_file=$stdout)
super
@nest = 0
@stylesheet = self.class.defaultStylesheet
end
def format(outlineRoot)
file.print('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
htmlTag('html') do
htmlTag('head') do
htmlTag('title') { outlineRoot.children[0].head }
htmlTag('link', { :rel => 'stylesheet', :type => 'text/css', :href => stylesheet() })
end
htmlTag('body') { visitItem(outlineRoot) }
end
end
end
# Format entire outline as nested series of ul/li/ul
class HTMLListFormatter < HTMLFormatter
def visitHead(item,seq=0)
return unless printHead?(item)
hLevel = "h#{item.level + 1}"
htmlTag('li', { :class => hLevel } ) { item.head }
end
def visitItem(item, seq=0)
hLevel = "h#{item.level + 1}"
if item.level >= 0
if item.level == 0 && seq > 0
htmlTag('hr', { :class => hLevel })
end
visitHead(item,seq)
visitText(item,seq)
end
if item.children.length > 0
htmlTag('ul', { :class => hLevel }) do
item.children.each_with_index { |ch,n| visitItem(ch,n) }
nil
end
end
end
end
# Construct an OutlineItem (the root item) from input text.
class Parser
protected
@@debug = false
def gets(sepString = $/)
retval = @getback.gets(sepString) || @file.gets(sepString)
if retval.nil?
$stderr.puts("(EOF)") if @@debug
throw(:eof, nil)
end
return retval.chomp(sepString)
end
def puts(line)
@pushback.puts(line)
@pushback.sync
end
def head(level)
$stderr.print("Looking for head(#{level})") if @@debug
line = gets
if m = line.match(@tabREs[level]) and m[2][0..0] != '|'
$stderr.print("... got #{m[2]}\n") if @@debug
return m[2]
else
puts(line)
$stderr.print("... rej #{line.inspect}\n") if @@debug
return nil
end
end
def text(level)
$stderr.print("Looking for text(#{level})") if @@debug
line = gets
if m = line.match(@tabREs[level]) and m[2].match(/\|\s?(.*)/)
$stderr.print("... got #{m[2][2..-1]}\n") if @@debug
return $1
else
puts(line)
$stderr.print("... rej #{line.inspect}\n") if @@debug
return nil
end
end
# get next outline item that starts with (at least) "level" tabs.
# return single item.
def item(level)
throw(:toodeep, nil) if level > 9
catch(:eof) do
catch(:toodeep) do
retval = Item.new(level)
retval.head = head(level)
return nil unless retval.head
catch(:eof) do
while t = text(level)
retval.addText(t)
end
end
retval.children = items(level+1)
$stderr.puts("Returning item [level=#{level}] [head=\"#{retval.head}\"] [text=#{retval.text.length}lns]") if @@debug
retval
end
end
end
# return array of items at the given level.
def items(level)
retval = []
while nextItem = item(level)
retval.push(nextItem)
end
return retval
end
public
def initialize(file=$stdin)
@pbString = ""
@pushback = StringIO.new(@pbString)
@getback = StringIO.new(@pbString)
@file=file
@tabREs = (0..9).to_a.collect { |n| Regexp.new("^(\\t{#{n}})(\\S.*)") }
end
def outline
return Item.new(-1, '', nil, nil, items(0))
end
def Parser.debugMode=(bool)
@@debug = bool
end
end # class Parser
# read options from ARGV
def parseAndFormat
# parse arguments
formatType = ''
outputFileName = nil
textOnly = false
parser = GetoptLong.new
parser.set_options(
[ '--format', '-f', GetoptLong::REQUIRED_ARGUMENT],
[ '--help', '-h', GetoptLong::NO_ARGUMENT],
[ '--output', '-o', GetoptLong::REQUIRED_ARGUMENT],
[ '--debug', '-d', GetoptLong::NO_ARGUMENT],
[ '--stylesheet', '-s', GetoptLong::REQUIRED_ARGUMENT],
[ '--include', '-i', GetoptLong::REQUIRED_ARGUMENT],
[ '--textonly', '-t', GetoptLong::NO_ARGUMENT])
parser.each_option do |name, arg|
case name
when '--format'
formatType = arg
when '--help'
$stderr.print <<-EOF
Usage: #{$0} [opt] [file [...]]
opt is one or more of:
--format, -f #{ "<'" + Formatter.formatterNames().join("'|'") + "'>" } set output format type
--help, -h display this help
--output, -o <filename> output to file named filename instead of stdout
--debug, -d turn on parser debugging to stderr
--textonly, -t omit heads except those starting with '+'
--stylesheet, -s <filename> link to stylesheet named filename (default=#{HTMLFormatter.defaultStylesheet})
--include, -i <filename> include Ruby module filename
EOF
exit(0)
when '--output'
outputFileName = arg
when '--debug'
Parser.debugMode = true
when '--textonly'
textOnly = true
when '--include'
require arg
end
end
outputFile = outputFileName.nil? ? $stdout : File.open(outputFileName, 'w')
outline = Parser.new(ARGF).outline
formatterClass = TVO.const_get("#{formatType}Formatter")
formatter = formatterClass.new(outputFile)
formatter.textOnly = textOnly
formatter.format(outline)
end
end # module TVO
if $0 == __FILE__
include TVO
parseAndFormat
end
More information about the VimOutliner
mailing list