Ignore:
Timestamp:
06/06/06 13:24:29 (14 years ago)
Author:
lawrence
Message:

This adds the code supporting a temporary patch for
the Unicode and orphan HTML character problems. Further
details are in ticket:311.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/ETxmlView.py

    r996 r1097  
    88import ElementTree as ET 
    99import re 
     10from sub_orphan import * 
    1011 
    1112def et2text(elem,indent='',html=0,space='   '): 
     
    6465        result+='%s/%s%s'%(lt,span(elem.tag,'xmlElemTag'),gt) 
    6566    return div(result,'xmlElem') 
     67     
     68def loadET(inputString): 
     69    ''' This method returns an elementtree object after some cleaning  
     70    of the string, essentially a hack to make sure that xml doesn't contain any  
     71    naughty & characters alone (typically from URL copies), or unescaped orphan  
     72    < or > signs ... and that the unicode has been processed to something that  
     73    might work''' 
     74      
     75    inputString=re.sub(r'&(?!\w+;)', '&amp;', inputString) 
     76     
     77    # first just try and do it so we don't waste time if we don't need to ... 
     78    try: 
     79        elem=ET.fromstring(inputString) 
     80        return elem 
     81    except: 
     82        pass 
     83        # and carry on  
    6684 
    67 def xmlCleanup(string): 
    68     '''This is a hack to make sure that xml doesn't contain any naughty & characters alone 
    69     (typically from URL copies)''' 
    70     #match ? but not (?! if it's followed by alphanumeric \w characters + and a semicolon ; 
    71     #don't want to use urlencode cos we don't know where it came from ... 
    72     return re.sub(r'&(?!\w+;)', '&amp;', string) 
     85    #ok, let's deal with orphan > and < signs then ... 
     86    subtool=subAI() 
     87    s=subtool.sub(inputString) 
     88 
     89    #now let's sort out an encoding 
     90    encodings=['utf-8','latin-1','iso-8859-1','ascii',] 
     91    elem=None 
     92    for option in encodings: 
     93        try: 
     94            s=s.encode(option,'replace') 
     95            try: 
     96                elem=ET.fromstring(s) 
     97            except: 
     98                s=re.sub('\n','<br/>LINE: ',s) 
     99                print '<p>%s</p>'%s 
     100                raise 
     101        except UnicodeError: 
     102            pass 
     103        else: 
     104            break 
     105    return elem 
    73106 
    74107def xml2text(xmlString): 
    75     tree=ET.fromstring(xmlCleanup(xmlString)) 
     108    tree=loadET(xmlString) 
    76109    return et2text(tree) 
    77110 
    78111def xml2HTML(xmlString,**kw): 
    79     tree=ET.fromstring(xmlCleanup(xmlString)) 
     112    tree=loadET(xmlString) 
    80113    return et2HTML(tree,**kw) 
Note: See TracChangeset for help on using the changeset viewer.