#
# Second attempt at HTML to Lout converter
# Version of 03-May-2001
#
# Requires gawk: the record separator is a regular expression and the
# main rule inspects RT to tell text records from tag records.
#

#
# protect(lin): quote the characters that are special to Lout.
#
#   lin      a run of document text (not a tag)
#   returns  the text with each of @ | # { } ^ ~ / \ wrapped in double
#            quotes; adjacent quoted items are merged ("@""#" -> "@#").
#
# The caller is expected to have converted literal double quotes already,
# otherwise the final merge step would also remove user-typed "" pairs.
#
function protect(lin,    out, pos)
{
    gsub(/@/,   "\"@\"", lin)
    gsub(/[|]/, "\"|\"", lin)
    gsub(/#/,   "\"#\"", lin)
    gsub(/[{]/, "\"{\"", lin)
    gsub(/[}]/, "\"}\"", lin)
    gsub(/\^/,  "\"^\"", lin)
    gsub(/~/,   "\"~\"", lin)
    gsub("/",   "\"/\"", lin)

    # A backslash must come out as "\\" (quote, two backslashes, quote).
    # This is done with index/substr rather than gsub because the meaning
    # of "\\" inside a gsub replacement differs between awk versions and
    # can collapse to a single backslash, leaving an unterminated string.
    out = ""
    while ((pos = index(lin, "\\")) > 0) {
        out = out substr(lin, 1, pos - 1) "\"\\\\\""
        lin = substr(lin, pos + 1)
    }
    lin = out lin

    # Merge neighbouring quoted items.
    gsub("\"\"", "", lin)
    return lin
}

#
# special(lin): translate HTML character entities, esp. ISO-LATIN ones.
#
#   lin      a run of document text, already passed through protect()
#   returns  the text with known entities replaced by Lout equivalents;
#            unknown named entities become "x" and any other &...;
#            sequence becomes a blank.
#
function special(lin)
{
    # Nothing to do when the text holds no ampersand at all.
    if (lin !~ /&/) return lin

    gsub("&nbsp;", "", lin)

    # Park &amp; in a sentinel character for now: emitting the literal
    # ampersand immediately would let the catch-all patterns at the end
    # swallow everything from that ampersand up to the next semicolon.
    gsub("&[Aa][Mm][Pp];", "\001", lin)

    gsub("&Aacute;", "{@Char Aacute}", lin)
    gsub("&aacute;", "{@Char aacute}", lin)
    gsub("&Agrave;", "{@Char Agrave}", lin)
    gsub("&agrave;", "{@Char agrave}", lin)
    gsub("&Acirc;",  "{@Char Acircumflex}", lin)
    gsub("&acirc;",  "{@Char acircumflex}", lin)
    gsub("&Auml;",   "{@Char Adieresis}", lin)
    gsub("&auml;",   "{@Char adieresis}", lin)
    gsub("&Ccedil;", "{@Char Ccedilla}", lin)
    gsub("&ccedil;", "{@Char ccedilla}", lin)
    gsub("&Eacute;", "{@Char Eacute}", lin)
    gsub("&eacute;", "{@Char eacute}", lin)
    gsub("&Egrave;", "{@Char Egrave}", lin)
    gsub("&egrave;", "{@Char egrave}", lin)
    gsub("&Ecirc;",  "{@Char Ecircumflex}", lin)
    gsub("&ecirc;",  "{@Char ecircumflex}", lin)
    gsub("&Ouml;",   "{@Char Odieresis}", lin)
    gsub("&Iacute;", "{@Char Iacute}", lin)
    gsub("&iacute;", "{@Char iacute}", lin)
    gsub("&Igrave;", "{@Char Igrave}", lin)
    gsub("&igrave;", "{@Char igrave}", lin)
    gsub("&Icirc;",  "{@Char Icircumflex}", lin)
    gsub("&icirc;",  "{@Char icircumflex}", lin)
    gsub("&ouml;",   "{@Char odieresis}", lin)
    gsub("&uuml;",   "{@Char udieresis}", lin)

    gsub("&[Gg][Tt];", ">", lin)
    gsub("&[Ll][Tt];", "<", lin)

    # Catch-alls for everything not translated above.
    gsub("&#[0-9][0-9][0-9];", " ", lin)
    gsub("&[A-Za-z]+;", "x", lin)
    gsub("&[^&;]+;", " ", lin)

    # Now it is safe to emit the quoted ampersand ("\\&" is a literal &).
    gsub("\001", "\"\\&\"", lin)
    return lin
}

#
# simple_convert(this_tag): convert tags that map one-to-one onto Lout.
#
#   this_tag  a complete tag including its angle brackets, e.g. "<b>"
#   returns   the Lout replacement for a recognised tag, otherwise the
#             tag itself (with blanks next to the brackets removed) so
#             that the caller can try its own conversions.
#
# NOTE(review): the source carried one more statement at the top of the
# "Common" section, of the form
#     if (this_tag ~ / ... [ ]+"," ",this_tag)) return this_tag
# whose middle part was lost. It could not be recovered and is omitted.
# The closing-tag patterns below are rebuilt from the opening tag each
# one follows.
#
function simple_convert(this_tag)
{
    # Remove excess whitespace
    gsub("[ ]+>", ">", this_tag)
    gsub("<[ ]+", "<", this_tag)

    # Common
    if (sub("^<[Pp]>", "@LP\n", this_tag)) return this_tag
    if (sub("<[Pp]>$", "\n@LP", this_tag)) return this_tag
    if (sub("<[Pp]>", " @LP ", this_tag)) return this_tag
    if (sub("<[Bb]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Bb]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Tt][Rr][Oo][Nn][Gg]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Ss][Tt][Rr][Oo][Nn][Gg]>", "}", this_tag)) return this_tag
    if (sub("<[Ii]>", "@I { ", this_tag)) return this_tag
    if (sub("</[Ii]>", "}", this_tag)) return this_tag
    if (sub("<[Ee][Mm]>", "@B { ", this_tag)) return this_tag
    if (sub("</[Ee][Mm]>", "}", this_tag)) return this_tag
    if (sub("<[Hh]1>", "@LeftDisplay @Heading @Underline {", this_tag)) return this_tag
    if (sub("<[Hh][23456]>", "@LeftDisplay @Heading {", this_tag)) return this_tag
    if (sub("</[Hh][123456]>", "}", this_tag)) return this_tag
    if (sub("<[Uu]>", "@Underline { ", this_tag)) return this_tag
    if (sub("</[Uu]>", "}", this_tag)) return this_tag

    # Less common
    if (sub("<[Bb][Rr]>", "\n@LLP ", this_tag)) return this_tag
    if (sub("<[Cc][Ee][Nn][Tt][Ee][Rr]>", "clines @Break {", this_tag)) return this_tag
    if (sub("</[Cc][Ee][Nn][Tt][Ee][Rr]>", "}", this_tag)) return this_tag
    if (sub("<[Hh][Rr]>", "@LP", this_tag)) return this_tag
    if (sub("<[Kk][Bb][Dd]>", "@F lines @Break { ", this_tag)) return this_tag
    if (sub("</[Kk][Bb][Dd]>", "}", this_tag)) return this_tag
    if (sub("<[Pp][Rr][Ee]>", "@F lines @Break {", this_tag)) return this_tag
    if (sub("</[Pp][Rr][Ee]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Uu][Pp]>", " @Sup{", this_tag)) return this_tag
    if (sub("</[Ss][Uu][Pp]>", "}", this_tag)) return this_tag
    if (sub("<[Ss][Uu][Bb]>", " @Sub{", this_tag)) return this_tag
    if (sub("</[Ss][Uu][Bb]>", "}", this_tag)) return this_tag
    if (sub("<[Tt][Tt]>", "@F lines @Break { ", this_tag)) return this_tag
    if (sub("</[Tt][Tt]>", "}", this_tag)) return this_tag

    # Rare
    if (sub("<[Bb][Oo][Dd][Yy]>", "", this_tag)) return this_tag
    if (sub("</[Bb][Oo][Dd][Yy]>", "", this_tag)) return this_tag
    if (sub("<[Hh][Ee][Aa][Dd]>", "", this_tag)) return this_tag
    if (sub("</[Hh][Ee][Aa][Dd]>", "", this_tag)) return this_tag
    if (sub("<[Hh][Tt][Mm][Ll]>", "# html\n", this_tag)) return this_tag
    if (sub("</[Hh][Tt][Mm][Ll]>", "", this_tag)) return this_tag
    if (sub("<[Tt][Ii][Tt][Ll][Ee]>", "@Display @Heading @Underline {", this_tag)) return this_tag
    if (sub("</[Tt][Ii][Tt][Ll][Ee]>", "}", this_tag)) return this_tag

    return this_tag
}

#
# getval(this_tag, var_name): fetch the value of a tag attribute.
#
#   this_tag  a complete tag, e.g. "<td colspan=\"2\">"
#   var_name  regular expression matching the attribute name AND its
#             "=" sign (see colsp_tag and rowsp_tag in BEGIN)
#   returns   the first whitespace-delimited word after the "=", with
#             quotes and the tag's closing ">" removed, or "" when the
#             attribute is absent.
#
# var and value are local scratch variables, not arguments.
#
function getval(this_tag, var_name, var, value)
{
    if (!match(this_tag, var_name)) return ""

    # Start right after the "=": the value may or may not be quoted, so
    # no character is skipped blindly; quotes are stripped instead.
    var = substr(this_tag, RSTART + RLENGTH)
    gsub(/["'>]/, "", var)
    split(var, value)
    return value[1]
}

#
# Initialise
#
BEGIN {
    empty = 0

    # Alternating opening/closing Lout quotation marks.
    quote = 1; q[1] = "``"; q[2] = "''"

    # Table cells are labelled A, B, C, ... within a row.
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    for (i = 1; i <= 26; i++) {
        cell[i] = substr(letters, i, 1)
    }

    #
    # Cleaner approach to text buffer
    # Requires the RT variable to be available
    #
    RS = "[<>]"

    #
    # Cleaner approach to tables
    #
    table_tag = "<[Tt][Aa][Bb][Ll][Ee][A-Za-z0-9\"`'= ]*>"
    rowsp_tag = "[Rr][Oo][Ww][Ss][Pp][Aa][Nn]="
    colsp_tag = "[Cc][Oo][Ll][Ss][Pp][Aa][Nn]="
    bord_tag  = "[Bb][Oo][Rr][Dd][Ee][Rr]="
}

#
# Header: put here so access FILENAME
#
NR == 1 {
    # FILENAME goes through %s so a "%" in the name cannot be taken as a
    # format directive.
    printf "#\n# Lout file prepared from html file \"%s\"\n#\n", FILENAME
    printf "@SysInclude { tbl }\n@SysInclude { doc }\n@Doc @Text @Begin\n\n"
}

#
# Main rule. Records are split at every "<" and ">": a record that was
# terminated by "<" is document text, any other record containing a
# letter is taken to be the inside of a tag.
#
{
    if (RT == "<") {
        this_text = $0
        #
        # Process text
        # Convert quote " to Lout quotation marks `` and ''.
        # Quote Lout reserved characters in the text
        #
        while (sub("\"", q[quote], this_text)) {
            quote = 3 - quote
        }
        this_text = protect(this_text)
        this_text = special(this_text)
        printf "%s", this_text
    } else if ($0 ~ /[A-Za-z]/) {
        this_tag = "<" $0 ">"
        #
        # Process HTML tag
        #
        # Simple tags first:
        #
        this_tag = simple_convert(this_tag)
        #
        # Definition list better treated as heading followed by
        # optional definition in rest of entry (driven by one example I have)
        #
        if (sub("<[Dd][Ll]>", "@IndentedList", this_tag)) {
            start_dl = 1
        }
        if (start_dl) {
            # The first <dt> of a list has no previous item to close.
            if (sub("<[Dd][Tt]>", "@ListItem { @B { ", this_tag)) {
                start_dl = 0
            }
        } else {
            sub("<[Dd][Tt]>", "}}\n@ListItem { @B { ", this_tag)
        }
        sub("<[Dd][Dd]>", "}: {", this_tag)
        sub("</[Dd][Ll]>", "}}\n@EndList", this_tag)
        #
        # Ordered or unordered list, or menu
        #
        if (sub("<[Oo][Ll]>", "@NumberedList ", this_tag)) { start_list = 1 }
        if (sub("<[Uu][Ll]>", "@BulletList ", this_tag)) { start_list = 1 }
        if (sub("<[Mm][Ee][Nn][Uu]>", "@BulletList ", this_tag)) { start_list = 1 }
        if (start_list) {
            # The first <li> of a list has no previous item to close.
            if (sub("<[Ll][Ii]>", "@ListItem { ", this_tag)) {
                start_list = 0
            }
        } else {
            sub("<[Ll][Ii]>", "}\n@ListItem { ", this_tag)
        }
        sub("</[Oo][Ll]>", "}\n@EndList", this_tag)
        sub("</[Uu][Ll]>", "}\n@EndList", this_tag)
        sub("</[Mm][Ee][Nn][Uu]>", "}\n@EndList", this_tag)
        #
        # Table
        #
        # NOTE(review): the per-row format string is built in "format"
        # and accumulated in "lin", but nothing visible here prints
        # either of them - confirm how the row formats are meant to
        # reach the output.
        #
        # Start of table
        if (match(this_tag, table_tag)) {
            printf "#\n# Table Starts Here\n#\n"
            nrows = 0
            start_table = 1
            rule = "no"
            if (match(this_tag, bord_tag)) {
                rule = "yes"
            }
            this_tag = "@Table\n @Location { TryAfterLine }"
        }
        # Caption
        sub("<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>", " @CaptionPos { Above }\n @Caption { ", this_tag)
        sub("</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>", " }", this_tag)
        # Table row
        new_row = (match(this_tag, "<[Tt][Rr]>") || match(this_tag, "</[Tt][Aa][Bb][Ll][Ee]>"))
        if (new_row) {
            if (match(this_tag, "<[Tt][Rr]>")) {
                nrows++
                if (start_table) {
                    start_table = 0
                    this_tag = "@Tbl\n rule { " rule " }\n{\n"
                } else {
                    this_tag = "} # new row "
                    lin = format " } # format\n" lin
                }
                ncells = 0
                nvirt_cells = 0
                format = "@Row\n format {"
            } else {
                # End of table: close the last cell and the @Tbl body.
                this_tag = " } # end of table \n}\n"
                lin = format " }\n" lin
            }
        } else if (match(this_tag, "<[Tt][DdHh]")) {
            ncells++
            nvirt_cells++
            # "+ 0" forces numeric comparison below, so that a span of
            # 10 or more is not compared as a string.
            col_span_length = getval(this_tag, colsp_tag) + 0
            row_span_length = getval(this_tag, rowsp_tag) + 0
            if (ncells > 1) format = format " |"
            if (row_span_length > 1) format = format " @StartVSpan "
            if (col_span_length > 1) {
                format = format " @StartHSpan "
                nvirt_cells += col_span_length - 1
            }
            format = format " @Cell " cell[ncells]
            for (i = 2; i <= col_span_length; i++) {
                format = format " | @HSpan"
            }
            if (ncells == 1) {
                this_tag = " " cell[ncells] " { "
            } else {
                # Close the previous cell before opening this one.
                this_tag = " } " cell[ncells] " { "
            }
        }
        #
        # All other HTML tags
        #
        sub("<[^>]*>", "", this_tag)
        #
        # Print tag
        #
        printf " %s", this_tag
    }
}

END {
    print "@End @Text\n"
}