#
# Second attempt at HTML to Lout converter
# Version of 03-May-2001
#
# Protect special Lout characters by quoting them
#
# protect(lin) -- quote every character that has a special meaning in Lout.
#
#   lin          text to protect; the caller has already turned any literal
#                double quotes into `` and ''
#   parts, n, i  scratch locals (awk idiom: extra parameters, never passed)
#
# Returns lin with each of @ | # { } ^ ~ / wrapped in double quotes and each
# backslash written as "\\".  Adjacent quoted items are merged afterwards, so
# "@""#" becomes "@#".
function protect(lin,    parts, n, i) {
    gsub("@", "\"@\"", lin)
    gsub("\\|", "\"|\"", lin)
    gsub("#", "\"#\"", lin)
    # Bracket expressions keep { and } from being read as interval operators.
    gsub("[{]", "\"{\"", lin)
    gsub("[}]", "\"}\"", lin)
    gsub("\\^", "\"^\"", lin)
    gsub("~", "\"~\"", lin)
    gsub("/", "\"/\"", lin)
    # Backslash: how many backslashes survive in a gsub() replacement string
    # differs between awk/gawk versions, so build the result by split/join,
    # which involves no replacement-string processing at all.
    n = split(lin, parts, /\\/)
    lin = parts[1]
    for (i = 2; i <= n; i++) {
        lin = lin "\"\\\\\"" parts[i]
    }
    # Merge neighbouring quoted runs: the closing quote of one item followed
    # directly by the opening quote of the next is simply dropped.
    gsub("\"\"", "", lin)
    return lin
}
#
# Translate safe HTML reserved/special characters esp ISO-LATIN
#
# special(lin) -- translate HTML character entities into Lout.
#
#   lin                  text, normally already passed through protect()
#   tbl, map, ent, n, i  scratch locals (extra parameters, never passed)
#
# Handles, in this order:
#   * &nbsp; (and the historical misspelling &nsbp;)  -> removed
#   * &amp;                                           -> "&" (quoted for Lout)
#   * accented letters, as a named entity (&eacute;) or as the literal
#     character                                       -> {@Char name}
#   * &gt; / &lt;                                     -> > / <
#   * numeric references &#NNN; (also the &"#"NNN; form that protect()
#     produces)                                       -> one space
#   * any other alphabetic entity                     -> x
#   * any other &...; sequence without whitespace     -> one space
# Returns the translated text.
function special(lin,    tbl, map, ent, n, i) {
    gsub("&[Nn][Bb][Ss][Pp];|&nsbp;", "", lin)
    # Park &amp; on a control character so that the quoted ampersand emitted
    # for it cannot be mistaken for the start of an entity by the catch-all
    # patterns below.  It is restored as the very last step.
    gsub("&[Aa][Mm][Pp];", "\001", lin)
    # Each item is: HTML entity name = Lout @Char glyph name = literal char.
    tbl = "Aacute=Aacute=Á aacute=aacute=á Agrave=Agrave=À agrave=agrave=à"
    tbl = tbl " Acirc=Acircumflex=Â acirc=acircumflex=â"
    tbl = tbl " Auml=Adieresis=Ä auml=adieresis=ä"
    tbl = tbl " Ccedil=Ccedilla=Ç ccedil=ccedilla=ç"
    tbl = tbl " Eacute=Eacute=É eacute=eacute=é Egrave=Egrave=È egrave=egrave=è"
    tbl = tbl " Ecirc=Ecircumflex=Ê ecirc=ecircumflex=ê"
    tbl = tbl " Ouml=Odieresis=Ö ouml=odieresis=ö"
    tbl = tbl " Iacute=Iacute=Í iacute=iacute=í Igrave=Igrave=Ì igrave=igrave=ì"
    tbl = tbl " Icirc=Icircumflex=Î icirc=icircumflex=î"
    tbl = tbl " uuml=udieresis=ü"
    n = split(tbl, map, " ")
    for (i = 1; i <= n; i++) {
        split(map[i], ent, "=")
        gsub("&" ent[1] ";|" ent[3], "{@Char " ent[2] "}", lin)
    }
    gsub("&[Gg][Tt];", ">", lin)
    gsub("&[Ll][Tt];", "<", lin)
    # Numeric character references.  The pattern is anchored on the leading
    # "&#" so that ordinary text such as "in 1999; we" is left untouched.
    gsub("&(\"#\"|#)[0-9]+;", " ", lin)
    # Catch-alls for everything not recognised above.  An entity never
    # contains whitespace, so a bare "&" in running text is not treated as
    # the start of one.
    gsub("&[A-Za-z]+;", "x", lin)
    gsub("&[^&; \t\r\n]+;", " ", lin)
    # "\\&" yields a literal ampersand in a gsub() replacement.
    gsub("\001", "\"\\&\"", lin)
    return lin
}
#
# Simple tag conversion
#
# simple_convert(this_tag) -- translate one attribute-free HTML tag to Lout.
#
#   this_tag  a complete tag including its angle brackets, e.g. "<b>" or "</b>"
#
# Returns the Lout replacement for the first pattern that matches, or the
# tag itself (with surplus blanks removed) when none matches, so that the
# caller can go on to handle lists, tables and so on.
#
# Every closing-tag pattern is anchored on "</".  Without the anchor a
# pattern such as "[Ii]>" would also hit "<li>", and "</b>" would be turned
# into "</}" instead of "}".
function simple_convert(this_tag) {
    # Remove excess whitespace
    gsub("[ ]+>", ">", this_tag)
    gsub("<[ ]+", "<", this_tag)
    # NOTE(review): the original statement at this point was truncated and did
    # not parse; only a "collapse runs of blanks" substitution could be
    # recovered from what was left -- confirm against an intact copy.
    gsub("[ ]+", " ", this_tag)
    # Common
    if (sub("^<[Pp]>", "@LP\n", this_tag)) return this_tag
    if (sub("<[Pp]>$", "\n@LP", this_tag)) return this_tag
    if (sub("<[Pp]>", " @LP ", this_tag)) return this_tag
    if (sub("<[Bb]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Bb]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Tt][Rr][Oo][Nn][Gg]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Ss][Tt][Rr][Oo][Nn][Gg]>", "}", this_tag)) return this_tag
    if (sub("<[Ii]>", "@I { ", this_tag)) return this_tag
    if (sub("</[Ii]>", "}", this_tag)) return this_tag
    # NOTE(review): <em> is rendered bold here, not italic -- kept as found.
    if (sub("<[Ee][Mm]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Ee][Mm]>", "}", this_tag)) return this_tag
    if (sub("<[Hh]1>", "@LeftDisplay @Heading @Underline {", this_tag)) return this_tag
    if (sub("<[Hh][23456]>", "@LeftDisplay @Heading {", this_tag)) return this_tag
    if (sub("</[Hh][123456]>", "}", this_tag)) return this_tag
    if (sub("<[Uu]>", "@Underline { ", this_tag)) return this_tag
    if (sub("</[Uu]>", "}", this_tag)) return this_tag
    # Less common
    if (sub("<[Bb][Rr]>", "\n@LLP ", this_tag)) return this_tag
    if (sub("<[Cc][Ee][Nn][Tt][Ee][Rr]>", "clines @Break {", this_tag)) return this_tag
    if (sub("</[Cc][Ee][Nn][Tt][Ee][Rr]>", "}", this_tag)) return this_tag
    if (sub("<[Hh][Rr]>", "@LP", this_tag)) return this_tag
    if (sub("<[Kk][Bb][Dd]>", "@F lines @Break { ", this_tag)) return this_tag
    if (sub("</[Kk][Bb][Dd]>", "}", this_tag)) return this_tag
    if (sub("<[Pp][Rr][Ee]>", "@F lines @Break {", this_tag)) return this_tag
    if (sub("</[Pp][Rr][Ee]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Uu][Pp]>", " @Sup{", this_tag)) return this_tag
    if (sub("</[Ss][Uu][Pp]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Uu][Bb]>", " @Sub{", this_tag)) return this_tag
    if (sub("</[Ss][Uu][Bb]>", "}", this_tag)) return this_tag
    if (sub("<[Tt][Tt]>", "@F lines @Break { ", this_tag)) return this_tag
    if (sub("</[Tt][Tt]>", "}", this_tag)) return this_tag
    # Rare
    if (sub("<[Bb][Oo][Dd][Yy]>", "", this_tag)) return this_tag
    if (sub("</[Bb][Oo][Dd][Yy]>", "", this_tag)) return this_tag
    if (sub("<[Hh][Ee][Aa][Dd]>", "", this_tag)) return this_tag
    if (sub("</[Hh][Ee][Aa][Dd]>", "", this_tag)) return this_tag
    if (sub("<[Hh][Tt][Mm][Ll]>", "# html\n", this_tag)) return this_tag
    if (sub("</[Hh][Tt][Mm][Ll]>", "", this_tag)) return this_tag
    if (sub("<[Tt][Ii][Tt][Ll][Ee]>", "@Display @Heading @Underline {", this_tag)) return this_tag
    if (sub("</[Tt][Ii][Tt][Ll][Ee]>", "}", this_tag)) return this_tag
    return this_tag
}
# getval(this_tag, var_name) -- extract the value of one attribute of a tag.
#
#   this_tag    complete tag text, e.g. "<td colspan=\"2\" align=left>"
#   var_name    regular expression matching the attribute name together with
#               its "=" sign
#   var, value  scratch locals (extra parameters, never passed by callers)
#
# Returns the first whitespace-delimited word following the "=", with single
# and double quotes and the tag's closing ">" removed, or "" when the
# attribute is absent or has no value.  The word comes out of split(), so a
# numeric value compares numerically in the caller.
function getval(this_tag, var_name, var, value) {
    if (!match(this_tag, var_name)) {
        return ""
    }
    # Start directly after the "=".  Starting one character later would skip
    # the first digit of an unquoted value such as colspan=2.
    var = substr(this_tag, RSTART + RLENGTH)
    gsub("[\"'>]", "", var)
    if (split(var, value) == 0) {
        return ""
    }
    return value[1]
}
#
# Initialise
#
BEGIN {
    empty = 0
    # q[] holds the opening and closing Lout quotation marks; `quote' is the
    # index of the one to use for the next double quote met in the text.
    quote = 1; q[1] = "``"; q[2] = "''"
    # cell[1..26] = "A".."Z": the names given to the cells of a table row.
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    for (i = 1; i <= length(letters); i++) {
        cell[i] = substr(letters, i, 1)
    }
    #
    # Cleaner approach to text buffer
    # Requires the RT variable to be available (gawk): every "<" or ">" ends
    # a record, and RT tells which of the two it was.
    #
    RS = "[<>]"
    #
    # Cleaner approach to tables
    #
    # A <table> tag, optionally followed by whitespace and any attributes.
    # Accepting every character except ">" lets values such as
    # width="100%" or bgcolor="#ffffff" through as well.
    table_tag = "<[Tt][Aa][Bb][Ll][Ee]([ \t\r\n][^>]*)?>"
    rowsp_tag = "[Rr][Oo][Ww][Ss][Pp][Aa][Nn]="
    colsp_tag = "[Cc][Oo][Ll][Ss][Pp][Aa][Nn]="
    bord_tag = "[Bb][Oo][Rr][Dd][Ee][Rr]="
}
#
# Header: printed here rather than in BEGIN so that FILENAME is available
#
NR == 1 {
    # FILENAME is passed as an argument, not spliced into the format string,
    # so a "%" in the file name cannot be taken as a conversion specifier.
    printf "#\n# Lout file prepared from html file \"%s\"\n#\n", FILENAME
    printf "@SysInclude { tbl }\n@SysInclude { doc }\n@Doc @Text @Begin\n\n"
}
#
# flush_table_row() -- write out the table row collected so far.
#
# The @Row format line can only be completed once every cell of the row has
# been seen, yet it has to appear in the output before the cell contents.
# Everything produced while inside a row is therefore held in row_buf and
# released here: the format first, then the buffered cells, then the brace
# that closes the last cell (only if the row had any cell).
# Uses the globals format, row_buf, ncells and in_row.
#
function flush_table_row() {
    printf "%s }\n%s", format, row_buf
    if (ncells > 0) {
        printf " }"
    }
    printf "\n"
    row_buf = ""
    in_row = 0
}
#
# Main rule.  With RS="[<>]" every record is either running text or the
# inside of a tag; RT says which character ended the record.  A record ended
# by ">" is a tag; anything else (ended by "<", or by end of file) is text.
# Nested tables are not supported.
#
{
    if (RT != ">") {
        this_text = $0
        #
        # Process text
        # Convert quote " to Lout quotation marks `` and ''.
        # Quote Lout reserved characters in the text
        #
        while (sub("\"", q[quote], this_text)) {
            quote = 3 - quote
        }
        this_text = protect(this_text)
        this_text = special(this_text)
        if (in_row) {
            row_buf = row_buf this_text
        } else {
            printf "%s", this_text
        }
    } else if ($0 ~ /[A-Za-z]/) {
        this_tag = "<" $0 ">"
        #
        # Process HTML tag
        #
        # Simple tags first:
        #
        this_tag = simple_convert(this_tag)
        #
        # Definition list better treated as heading followed by
        # optional definition in rest of entry (driven by one example I have)
        #
        if (sub("<[Dd][Ll]>", "@IndentedList", this_tag)) { start_dl = 1 }
        if (start_dl) {
            if (sub("<[Dd][Tt]>", "@ListItem { @B { ", this_tag)) { start_dl = 0 }
        } else {
            # Any later <dt> first closes the previous item.
            sub("<[Dd][Tt]>", "}}\n@ListItem { @B { ", this_tag)
        }
        sub("<[Dd][Dd]>", "}: {", this_tag)
        sub("</[Dd][Ll]>", "}}\n@EndList", this_tag)
        #
        # Ordered or unordered list, or menu
        #
        if (sub("<[Oo][Ll]>", "@NumberedList ", this_tag)) { start_list = 1 }
        if (sub("<[Uu][Ll]>", "@BulletList ", this_tag)) { start_list = 1 }
        if (sub("<[Mm][Ee][Nn][Uu]>", "@BulletList ", this_tag)) { start_list = 1 }
        if (start_list) {
            if (sub("<[Ll][Ii]>", "@ListItem { ", this_tag)) { start_list = 0 }
        } else {
            sub("<[Ll][Ii]>", "}\n@ListItem { ", this_tag)
        }
        # Closing tags are anchored on "</" so that the whole tag is replaced
        # and unrelated tags ending in the same letters are left alone.
        sub("</[Oo][Ll]>", "}\n@EndList", this_tag)
        sub("</[Uu][Ll]>", "}\n@EndList", this_tag)
        sub("</[Mm][Ee][Nn][Uu]>", "}\n@EndList", this_tag)
        #
        # Table
        #
        # Start of table
        if (match(this_tag, table_tag)) {
            printf "#\n# Table Starts Here\n#\n"
            nrows = 0
            start_table = 1
            rule = "no"
            if (match(this_tag, bord_tag)) {
                rule = "yes"
            }
            this_tag = "@Table\n @Location { TryAfterLine }"
        }
        # Caption
        sub("<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>", \
            " @CaptionPos { Above }\n @Caption { ", this_tag)
        sub("</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>", " }", this_tag)
        if (match(this_tag, "<[Tt][Rr]([ \t\r\n][^>]*)?>")) {
            # Table row: finish the previous one, if any, and start buffering.
            nrows++
            if (in_row) {
                flush_table_row()
            }
            if (start_table) {
                start_table = 0
                printf " @Tbl\n rule { %s }\n{\n", rule
            }
            in_row = 1
            ncells = 0
            nvirt_cells = 0
            format = "@Row\n format {"
            this_tag = ""
        } else if (match(this_tag, "</[Tt][Aa][Bb][Ll][Ee]>")) {
            # End of table
            if (in_row) {
                flush_table_row()
                # This brace closes the one opened after @Tbl.
                this_tag = "} # end of table\n"
            } else if (start_table) {
                # Table without a single row: give @Table an empty body.
                this_tag = "{}\n"
            }
            start_table = 0
        } else if (match(this_tag, "<[Tt][DdHh][ \t\r\n>]")) {
            # Table cell (<td> or <th>, with or without attributes).
            ncells++
            nvirt_cells++
            # "+ 0" forces a number, so a malformed value can neither be
            # compared as a string nor keep the loop below running forever.
            col_span_length = getval(this_tag, colsp_tag) + 0
            row_span_length = getval(this_tag, rowsp_tag) + 0
            if (ncells > 1) format = format " |"
            if (row_span_length > 1) format = format " @StartVSpan "
            if (col_span_length > 1) {
                format = format " @StartHSpan "
                nvirt_cells += col_span_length - 1
            }
            # NOTE(review): only 26 cell names (A-Z) exist; a wider row gets
            # empty names from cell[].
            format = format " @Cell " cell[ncells]
            for (i = 2; i <= col_span_length; i++) {
                format = format " | @HSpan"
            }
            if (ncells == 1) {
                this_tag = " " cell[ncells] " { "
            } else {
                # Close the previous cell before opening this one.
                this_tag = " } " cell[ncells] " { "
            }
        }
        #
        # All other HTML tags
        #
        sub("<[^>]*>", "", this_tag)
        #
        # Print tag
        #
        if (in_row) {
            row_buf = row_buf " " this_tag
        } else {
            printf " %s", this_tag
        }
    }
}
END {
    # Input ended inside a table: release the buffered row and close @Tbl.
    if (in_row) {
        flush_table_row()
        printf "}\n"
    }
    print "@End @Text\n"
}