6.17.2.1. HTML Parser

We provide an HTML meta tangler. The sgml_wrapper class maps writeline calls from the control algorithm to feed calls on the parser.

Construction of an html_filter object feeds the parser with an initial <HTML> tag. Termination semantics are as follows: if the external data source becomes exhausted, processing of buffered data should be forced by calling the close method of the sgml_wrapper or html_filter object.

If a </HTML> ending tag is detected, an eoi exception is thrown.

In either case, reset() may be called to reinitialise the object state, or the object can be destroyed. Note that the current implementation dispatches tags to global methods, rather than to a weaver bound to the object. This is to permit bindings to Interscript operations other than weaving.

Embedded Python and Tcl (if supported) can be executed in the Interscript environment using the <SCRIPT> tag as follows:

  <SCRIPT LANGUAGE="Python"><!--
    print "Hello World"
  #-->
  </SCRIPT>
  <SCRIPT LANGUAGE="Tcl"><!--
    puts "Hello World"
  #-->
  </SCRIPT>

Note that the use of comments is _not_ optional. If an error is detected during execution, a diagnostic will be printed but will not terminate continued processing of the document beyond the ending SCRIPT tag.

Start python section to interscript/parsers/html.py[1 /1 ]
     1: #line 39 "html_parser.ipk"
     2: from interscript.drivers.sources.base import eoi
     3: import string
     4: import traceback
     5: class sgml_wrapper:
     6:   def __init__(self, sgml):
     7:     self.sgml = sgml
     8: 
     9:   def writeline(self,data,file,count):
    10:     self.sgml.feed(data)
    11: 
    12:   def close():
    13:     self.sgml.close(self)
    14: 
    15:   def reset(self):
    16:     self.sgml.reset()
    17: 
    18: # this is a hack: sgmllib needs to be imported here
    19: # so the class SGMLParser defined in it can be used as a base
    20: import sgmllib
    21: 
    22: class html_filter(sgmllib.SGMLParser):
    23:   def __init__(self, input_frame):
    24:     sgmllib.SGMLParser.__init__(self)
    25:     self.save_data = 0
    26:     self.script_language = ''
    27:     self.input_frame = input_frame
    28:     self.weaver = input_frame.get_weaver()
    29:     self.process = input_frame.process
    30:     # feeding <HTML> in here is a hack to get around a bug in sgmllib,
    31:     # which fails to process unbalanced end tags correctly
    32:     self.feed('<HTML>')
    33: 
    34:   def _save(self):
    35:     self.save_data = 1
    36:     self.saved_data = ''
    37:   def _saved(self):
    38:     self.save_data = 0
    39:     return self.saved_data
    40: 
    41:   def handle_data(self,data):
    42:     new_data = ''
    43:     for ch in data:
    44:       if ch == '\n': ch = ' \n'
    45:       new_data = new_data + ch
    46:     if self.save_data:
    47:       self.saved_data = self.saved_data + new_data
    48:     else:
    49:       self.weaver.write(new_data)
    50: 
    51:   def handle_comment(self,data):
    52:     if 'parsers' in self.process.trace:
    53:       print 'SGML comment',data
    54:     if self.script_language != '':
    55:       self.saved_comments = self.saved_comments + data
    56: 
    57:   def start_html(self, attributes): pass
    58:   def start_head(self, attributes): pass
    59:   def end_head(self): pass
    60:   def start_body(self, attributes): pass
    61:   def end_body(self): pass
    62:   def end_html(self):
    63:     del self.input_frame
    64:     del self.weaver
    65:     raise eoi
    66: 
    67: # fonts
    68:   def start_b(self,attributes): self.weaver.begin_bold()
    69:   def end_b(self): self.weaver.end_bold()
    70: 
    71:   def start_i(self,attributes): self.weaver.begin_italic()
    72:   def end_i(self): self.weaver.end_italic()
    73: 
    74:   def start_em(self,attributes): self.weaver.begin_emphasize()
    75:   def end_em(self): self.weaver.end_emphasize()
    76: 
    77:   def start_strong(self,attributes): self.weaver.begin_strong()
    78:   def end_strong(self): self.weaver.end_strong()
    79: 
    80:   def start_small(self,attributes): self.weaver.begin_small()
    81:   def end_small(self): self.weaver.end_small()
    82: 
    83:   def start_big(self,attributes): self.weaver.begin_big()
    84:   def end_big(self): self.weaver.end_big()
    85: 
    86:   def start_code(self,attributes): self.weaver.begin_code()
    87:   def end_code(self): self.weaver.end_code()
    88: 
    89: # paragraphs
    90:   def start_p(self,attributes): self.weaver.prose()
    91:   def end_p(self): self.weaver.eop()
    92: 
    93: # displays
    94:   def start_pre(self,attributes): self.weaver.begin_displayed_code()
    95:   def end_pre(self): self.weaver.end_displayed_code()
    96: 
    97: #lists
    98:   def start_ol(self,attributes):
    99:     self.weaver.begin_numbered_list()
   100:     self.list_kind = 'ol'
   101:   def end_ol(self):
   102:     self.weaver.end_numbered_list()
   103: 
   104:   def start_dl(self,attributes):
   105:     self.weaver.begin_keyed_list()
   106:     self.list_kind = 'dl'
   107:   def end_dl(self):
   108:     self.weaver.end_keyed_list()
   109: 
   110:   def start_ul(self,attributes):
   111:     self.weaver.begin_bullet_list()
   112:     self.list_kind = 'ul'
   113:   def end_ul(self):
   114:     self.weaver.end_bullet_list()
   115: 
   116: #list items
   117:   def start_li(self,attributes):
   118:     if self.list_kind == 'ol':
   119:       self.weaver.begin_numbered_list_item()
   120:     else:
   121:       self.weaver.begin_bullet_list_item()
   122: 
   123:   def end_li(self):
   124:     if self.list_kind == 'ol':
   125:       self.weaver.end_numbered_list_item()
   126:     else:
   127:       self.weaver.end_bullet_list_item()
   128: 
   129:   def start_dt(self,attributes): self._save()
   130:   def end_dt(self):
   131:     self.weaver.begin_keyed_list_item(self._saved())
   132: 
   133:   def start_dd(self,attributes): pass
   134:   def end_dd(self): self.weaver.end_keyed_list_item()
   135: 
   136: #headings
   137:   def start_h1(self,attributes): self._save()
   138:   def end_h1(self): self.weaver.head(1,self._saved())
   139: 
   140:   def start_h2(self,attributes): self._save()
   141:   def end_h2(self): self.weaver.head(2,self._saved())
   142: 
   143:   def start_h3(self,attributes): self._save()
   144:   def end_h3(self): self.weaver.head(3,self._saved())
   145: 
   146:   def start_h4(self,attributes): self._save()
   147:   def end_h4(self): self.weaver.head(4,self._saved())
   148: 
   149:   def start_h5(self,attributes): self._save()
   150:   def end_h5(self): self.weaver.head(5,self._saved())
   151: 
   152:   def start_h6(self,attributes): self._save()
   153:   def end_h6(self): self.weaver.head(6,self._saved())
   154: 
   155:   def unknown_starttag(self,tag,attributes):
   156:     print 'UNKNOWN START TAG',tag,attributes
   157: 
   158:   def unknown_endtag(self,tag):
   159:     print 'UNKNOWN END TAG',tag
   160: 
   161:   def unknown_charref(self,ref):
   162:     print 'BAD CHAR REF',ref
   163: 
   164:   def unknown_entityref(self,ref):
   165:     print 'UNKNOWN ENTITY REF',ref
   166: 
   167:   # due to a bug in sgmllib, this routine will
   168:   # never be called
   169:   def report_unbalanced(self,tag):
   170:     print 'LONELY ENDTAG',tag
   171: 
   172:   def start_script(self,attributes):
   173:     if 'parsers' in self.process.trace:
   174:       print 'start of script'
   175:     for param, value in attributes:
   176:       if string.lower(param) == 'language':
   177:         self.script_language = string.lower(value)
   178:         self.saved_comments = ''
   179: 
   180:   def end_script(self):
   181:     if 'parsers' in self.process.trace:
   182:       print 'end of script'
   183:     if self.script_language == 'python':
   184:       try:
   185:         exec self.saved_comments in globals(),self.input_frame.userdict
   186:       except:
   187:         print "Error executing python <SCRIPT>"
   188:         traceback.print_exc()
   189:     else:
   190:       print 'Sorry',self.script_language,'not available'
End python section to interscript/parsers/html.py[1]