Changeset 1097 for TI07-MOLES


Ignore:
Timestamp:
06/06/06 13:24:29 (13 years ago)
Author:
lawrence
Message:

This changeset contains the code for a temporary patch for
the Unicode and orphan HTML character problems. Further
details are in ticket:311.

Location:
TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/DIF.py

    r1049 r1097  
    88from ServiceBinding import Service 
    99from renderEntity import renderEntity 
     10from ETxmlView import loadET 
    1011 
    1112class DIF: 
     
    1617        '''Initialise a python dif instance based on an xml document ''' 
    1718        self.metadataType='DIF' 
    18         #self.xml=xml.decode('utf-8','replace') 
    19         #self.xml=unicode(xml,'latin-1') 
    20         #self.xml=xml.encode('latin-1','replace') 
    21         encodings=['ascii',]#['latin-1','ascii','iso-8859-1'] 
    2219        try: 
    23             for option in encodings: 
    24                 try: 
    25                     s=xml.encode(option,'replace') 
    26                      
    27                     self.elem=ET.fromstring(s) 
    28                     self.xml=s 
    29                 except UnicodeError: 
    30                     pass 
    31                 else: 
    32                     break 
     20            self.elem=loadET(xml) 
     21            self.xml=xml 
    3322        except: 
    3423            # for some reason we can't parse the document ... 
    35             #print '<p>%s</p>'%xml[11385:11395] 
    36             #raise bnl 
    37             #import codecs 
    38             #f=codecs.open('unparseable.xml','a','utf-8') 
    39             #f.write(xml+'\n') 
    40             print '<p>'+s[11360:11420]+'</p>' 
    41             raise  
     24            raise 
    4225            self.elem=None 
    4326            return 
  • TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/ETxmlView.py

    r996 r1097  
    88import ElementTree as ET 
    99import re 
     10from sub_orphan import * 
    1011 
    1112def et2text(elem,indent='',html=0,space='   '): 
     
    6465        result+='%s/%s%s'%(lt,span(elem.tag,'xmlElemTag'),gt) 
    6566    return div(result,'xmlElem') 
     67     
     68def loadET(inputString): 
     69    ''' This method returns an elementtree object after some cleaning  
     70    of the string, essentially a hack to make sure that xml doesn't contain any  
     71    naughty & characters alone (typically from URL copies), or unescaped orphan  
     72    < or > signs ... and that the unicode has been processed to something that  
     73    might work''' 
     74      
     75    inputString=re.sub(r'&(?!\w+;)', '&amp;', inputString) 
     76     
     77    # first just try and do it so we don't waste time if we don't need to ... 
     78    try: 
     79        elem=ET.fromstring(inputString) 
     80        return elem 
     81    except: 
     82        pass 
     83        # and carry on  
    6684 
    67 def xmlCleanup(string): 
    68     '''This is a hack to make sure that xml doesn't contain any naughty & characters alone 
    69     (typically from URL copies)''' 
    70     #match ? but not (?! if it's followed by alphanumeric \w characters + and a semicolon ; 
    71     #don't want to use urlencode cos we don't know where it came from ... 
    72     return re.sub(r'&(?!\w+;)', '&amp;', string) 
     85    #ok, let's deal with orphan > and < signs then ... 
     86    subtool=subAI() 
     87    s=subtool.sub(inputString) 
     88 
     89    #now let's sort out an encoding 
     90    encodings=['utf-8','latin-1','iso-8859-1','ascii',] 
     91    elem=None 
     92    for option in encodings: 
     93        try: 
     94            s=s.encode(option,'replace') 
     95            try: 
     96                elem=ET.fromstring(s) 
     97            except: 
     98                s=re.sub('\n','<br/>LINE: ',s) 
     99                print '<p>%s</p>'%s 
     100                raise 
     101        except UnicodeError: 
     102            pass 
     103        else: 
     104            break 
     105    return elem 
    73106 
    74107def xml2text(xmlString): 
    75     tree=ET.fromstring(xmlCleanup(xmlString)) 
     108    tree=loadET(xmlString) 
    76109    return et2text(tree) 
    77110 
    78111def xml2HTML(xmlString,**kw): 
    79     tree=ET.fromstring(xmlCleanup(xmlString)) 
     112    tree=loadET(xmlString) 
    80113    return et2HTML(tree,**kw) 
  • TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/browseCGI.py

    r1050 r1097  
    230230                results=ws.GetResults(offset=state.offset,number=state.stride) 
    231231                difs=[] 
    232                 for result in results: 
    233                     difs.append(xmlCleanup(result)) 
     232                for result in results: difs.append(result) 
    234233                html=renderDiscoverySet(difs,state,selector=self.selector, 
    235234                               summary=1,spatial=1,temporal=1,services=1) 
  • TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/renderDiscoverySet.py

    r1023 r1097  
    132132    for result in results: 
    133133        #g.write(xmlCleanup(result)+'\n') 
    134         difs.append(xmlCleanup(result)) 
     134        difs.append(result) 
    135135    html=renderDiscoverySet(difs,state,summary=1,spatial=1,temporal=1) 
    136136    f=file('output.html','wb') 
Note: See TracChangeset for help on using the changeset viewer.