include/HTMLParser/Parser.hpp

Go to the documentation of this file.
00001 #ifndef hpp_CPP_Parser_CPP_hpp
00002 #define hpp_CPP_Parser_CPP_hpp
00003 
00004 
00005 /*
00006   ==============================================================================
00007 
00008    HTML Parser for UZI
00009 
00010   ==============================================================================
00011 */
00012 
00013 // We need DTD types 
00014 #include "DTDTypes.hpp"
00015 // We need generic elements
00016 #include "GenericElements.hpp"
00017 // We need generic attributes
00018 #include "GenericAttributes.hpp"
00019 // We need streams 
00020 #include "../Streams/Streams.hpp"
00021 // We need trees
00022 #include "../Tree/NTree.hpp"
00023 // We need allocators too 
00024 #include "Allocator.hpp"
00025 // We need HTML entity tagging too
00026 #include "HTMLEscape.hpp"
00027 
00029 namespace HTML
00030 {
00032     class BuildRadix
00033     {
00034     private:
00036         uint32    radix;
00037     public:
00041         inline void addToRadix(tchar ch) throw()  { if (ch >= 'A' && ch <= 'Z') radix |= 1 << (ch - 'A');
00042                                                     if (ch >= 'a' && ch <= 'z') radix |= 1 << (ch - 'a'); }
00043 
00048         inline bool checkAgainstLowercase(tchar ch) const throw() { return (radix & (1 << (ch - 'a'))) > 0; }
00049 
00054         inline bool checkAgainstUppercase(tchar ch) const throw() { return (radix & (1 << (ch - 'A'))) > 0; }
00055         
00057         BuildRadix() : radix(0) {}
00058     };
00059 
00138     class Parser
00139     {
00140     public:
00142         enum ParsingTime
00143         {
00144             InstantParsing = 0,     
00145             DelayedParsing = 1,     
00146         };
00147 
00151     public:
00153         static const BuildRadix  radixElements;
00154     private:
00156         Stream::MemoryBufferedStream     inputStream;
00161         struct ParsingError
00162         {
00164             enum ErrorCode
00165             {
00166                 Valid                       =           0,      
00167                 
00169                 WarningBadStructure         = 0x00000001,      
00170                 WarningBadEntity            = 0x00000002,      
00171                 WarningUnknownAttribute     = 0x00000003,      
00172                 WarningBadElement           = 0x00000004,      
00173                 WarningBadAttributeValue    = 0x00000005,      
00174                 WarningDifferentLengths     = 0x00000006,      
00175                 WarningNotHTML              = 0x00000007,      
00176                 WarningEndTagExpected       = 0x00000008,      
00177                 WarningEndTagUnexpected     = 0x00000009,      
00178                 WarningEndTagMismatch       = 0x0000000A,      
00179                 WarningStartTagInstead      = 0x0000000B,      
00180                 WarningStartTagUnexpected   = 0x0000000C,      
00181 
00183                 ErrorBadStructure           = 0x80000001,      
00184                 ErrorBadEntity              = 0x80000002,      
00185                 ErrorUnknownAttribute       = 0x80000003,      
00186                 ErrorBadElement             = 0x80000004,      
00187                 ErrorBadAttributeValue      = 0x80000005,      
00188                 ErrorDocumentToBig          = 0x80000006,      
00189                 ErrorInvalidInputStream     = 0x80000700,      
00190                 ErrorNotEnoughMemory        = 0x80000701,      
00191 
00192                 ErrorNotHTML                = 0x80000711,      
00193 
00195                 ErrorMask                   = 0x80000000,      
00196 
00197             };
00198 
00200             ErrorCode       errorCode;
00202             uint32          errorPosition; 
00204             uint32          errorLength;
00206             ParsingError *  nextError;
00207 
00208 
00210             ParsingError(const ErrorCode & code, uint32 pos, uint32 len) : errorCode(code), errorPosition(pos), errorLength(len), nextError(0) {}
00212             /*virtual*/ ~ParsingError() { if (nextError) delete nextError; nextError = 0; }
00214             void chainError(ParsingError * newError) 
00215             { 
00216                 if (errorCode == Valid) 
00217                 { 
00218                     errorCode = newError->errorCode; 
00219                     errorPosition = newError->errorPosition; 
00220                     errorLength = newError->errorLength;  
00221                     nextError = newError->nextError; 
00222                     newError->nextError = 0; 
00223                     delete newError; 
00224                 } 
00225                 else if (nextError == 0) 
00226                     nextError = newError; 
00227                 else 
00228                     nextError->chainError(newError); 
00229             }
00231             const tchar * getErrorString() const
00232             {
00233                 // The following regular expression make the conversion
00234                 // ^\:b+\(\:w\)\:b+= 0x\:h,\:b+//!<\ (.+\)$
00235                 // case \1 : return "\2";
00236                 switch(errorCode)
00237                 {
00238                 case Valid:
00239                     return "No error, the document is valid";
00240                 case WarningBadStructure: 
00241                     return "The document is badly structured, see the content of errorPosition for details where";
00242                 case WarningBadEntity: 
00243                     return "The document contains a bad entity (not declared in current standard), see content of errorPosition for details where";
00244                 case WarningUnknownAttribute: 
00245                     return "One attribute for an element is unknown, see the content of errorPosition for details where";
00246                 case WarningBadElement: 
00247                     return "The document contains a bad element (not declared in current standard), see content of errorPosition for details where";
00248                 case WarningBadAttributeValue: 
00249                     return "The document contains a bad attribute value (not declared in current standard), see content of errorPosition for details where";
00250                 case WarningDifferentLengths: 
00251                     return "The stream length is not the same as the detected text length, see errorPosition for detected text length and errorLength for declared stream length";
00252                 case WarningNotHTML: 
00253                     return "The document doesn't start by an HTML element as root";
00254                 case WarningEndTagExpected:
00255                     return "An end tag was expected, see the content of errorPosition for details where";             
00256                 case WarningEndTagUnexpected:
00257                     return "An unexpected end tag has been found, see the content of errorPosition for details where";
00258                 case WarningEndTagMismatch:
00259                     return "An mismatching end tag has been found, see the content of errorPosition for details where";
00260                 case WarningStartTagInstead:
00261                     return "An end tag was expected but a start tag has been found, see the content of errorPosition for details where";
00262                 case WarningStartTagUnexpected:
00263                     return "An unexpected start tag has been found, see the content of errorPosition for details where";
00264 
00265                     
00266                 case ErrorBadStructure: 
00267                     return "The document is badly structured, see the content of errorPosition for details where";
00268                 case ErrorBadEntity: 
00269                     return "The document contains a bad entity (not declared in current standard), see content of errorPosition for details where";
00270                 case ErrorUnknownAttribute: 
00271                     return "One attribute for an element is unknown, see the content of errorPosition for details where";
00272                 case ErrorBadElement: 
00273                     return "The document contains a bad element (not declared in current standard), see content of errorPosition for details where";
00274                 case ErrorBadAttributeValue: 
00275                     return "The document contains a bad attribute value (not declared in current standard), see content of errorPosition for details where";
00276                 case ErrorDocumentToBig: 
00277                     return "The document is far too big to be parsed, set errorLength for declared document size";
00278                 case ErrorInvalidInputStream: 
00279                     return "The input stream is invalid or can't be read";
00280                 case ErrorNotEnoughMemory:
00281                     return "Not enough memory to allocate elements";
00282                 case ErrorNotHTML: 
00283                     return "The given document is clearly not HTML";
00284                 default:
00285                     return "This error code is unknown";
00286                 }
00287             }
00289             inline bool isError() const
00290             {
00291                 if ((uint32)errorCode & (uint32)ErrorMask) return true;
00292                 else if (nextError) return nextError->isError();
00293                 return false;
00294             }
00296             inline bool isWarning() const
00297             {
00298                 if (((uint32)errorCode & (uint32)ErrorMask) == 0 && errorCode != Valid ) return true;
00299                 else if (nextError) return nextError->isWarning();
00300                 return false;
00301             }
00303             inline uint32 chainedErrorCount() const
00304             {
00305                 if (nextError) return nextError->chainedErrorCount() + 1;
00306                 return 1;
00307             }
00308         };
00309 
00311         DOMTree                     DOMtree;
00313         const CharsetFunctions &    charsetFunctions;
00317         DOM::NodeDeleter            parserDeleter;
00319         ParsingTime                 parsingTime;
00321         HTML::DTDType               chosenDTD;
00323         ParsingError                lastParsingError;
00324         
00325 
00326     public:
00333         Parser(Stream::InputStream & inputStreamRef, HTML::Elements::Allocators::BaseAllocator & allocatorRef, const ParsingTime & whenToParse = InstantParsing, const HTML::DTDType & dtd = HTML::StandardDTD);
00335         ~Parser() 
00336         {
00337             // The DOM tree is automaticcally deleted by the DOM::NodeDeleter instance
00338         }    
00339         
00340     
00344         const ParsingError & Parse(); 
00345 
00347         inline const ParsingError & getLastParsingError() const { return lastParsingError; }
00348 
00350         inline HTML::DOMTree & getDOMTree() { return DOMtree; }
00352         inline const HTML::DOMTree & getDOMTree() const { return DOMtree; }
00353 
00357         const unsigned char * getUnboundedAccessToStream(const uint32 startPos) { if (startPos < inputStream.fullSize()) return inputStream.getBuffer() + startPos; return 0; }
00358 
00359         // Helpers
00360     private:
00362         uint32 parseAttribute(const tchar * buffer, uint32 & currentPosition, int & remainingLength, int & attribStartPos, int & attribEndPos, int & attribContentStart, int & attribContentEnd);
00364         void parseAttributeIntoNode(DOMTree::Node * newNode, const tchar * buffer, uint32 & currentPosition, int & remainingLength, int & attribStartPos, int & attribEndPos, int & attribContentStart, int & attribContentEnd);
00365     };
00366 
00367 
00368 }
00369 
00370 
00371 
00372 /* All CSS2 properties below
00373 TODO: Write a CSS 2.0 parser
00374 accelerator
00375 azimuth
00376 background
00377 background-attachment
00378 background-color
00379 background-image
00380 background-position
00381 background-position-x
00382 background-position-y
00383 background-repeat
00384 behavior
00385 border
00386 border-bottom
00387 border-bottom-color
00388 border-bottom-style
00389 border-bottom-width
00390 border-collapse
00391 border-color
00392 border-left
00393 border-left-color
00394 border-left-style
00395 border-left-width
00396 border-right
00397 border-right-color
00398 border-right-style
00399 border-right-width
00400 border-spacing
00401 border-style
00402 border-top
00403 border-top-color
00404 border-top-style
00405 border-top-width
00406 border-width
00407 bottom
00408 caption-side
00409 clear
00410 clip
00411 color
00412 content
00413 counter-increment
00414 counter-reset
00415 cue
00416 cue-after
00417 cue-before
00418 cursor
00419 direction
00420 display
00421 elevation
00422 empty-cells
00423 filter
00424 float
00425 font
00426 font-family
00427 font-size
00428 font-size-adjust
00429 font-stretch
00430 font-style
00431 font-variant
00432 font-weight
00433 height
00434 ime-mode
00435 include-source
00436 layer-background-color
00437 layer-background-image
00438 layout-flow
00439 layout-grid
00440 layout-grid-char
00441 layout-grid-char-spacing
00442 layout-grid-line
00443 layout-grid-mode
00444 layout-grid-type
00445 left
00446 letter-spacing
00447 line-break
00448 line-height
00449 list-style
00450 list-style-image
00451 list-style-position
00452 list-style-type
00453 margin
00454 margin-bottom
00455 margin-left
00456 margin-right
00457 margin-top
00458 marker-offset
00459 marks
00460 max-height
00461 max-width
00462 min-height
00463 min-width
00464 -moz-binding
00465 -moz-border-radius
00466 -moz-border-radius-topleft
00467 -moz-border-radius-topright
00468 -moz-border-radius-bottomright
00469 -moz-border-radius-bottomleft
00470 -moz-border-top-colors
00471 -moz-border-right-colors
00472 -moz-border-bottom-colors
00473 -moz-border-left-colors
00474 -moz-opacity
00475 -moz-outline
00476 -moz-outline-color
00477 -moz-outline-style
00478 -moz-outline-width
00479 -moz-user-focus
00480 -moz-user-input
00481 -moz-user-modify
00482 -moz-user-select
00483 orphans
00484 outline
00485 outline-color
00486 outline-style
00487 outline-width
00488 overflow
00489 overflow-X
00490 overflow-Y
00491 padding
00492 padding-bottom
00493 padding-left
00494 padding-right
00495 padding-top
00496 page
00497 page-break-after
00498 page-break-before
00499 page-break-inside
00500 pause
00501 pause-after
00502 pause-before
00503 pitch
00504 pitch-range
00505 play-during
00506 position
00507 quotes
00508 -replace
00509 richness
00510 right
00511 ruby-align
00512 ruby-overhang
00513 ruby-position
00514 -set-link-source
00515 size
00516 speak
00517 speak-header
00518 speak-numeral
00519 speak-punctuation
00520 speech-rate
00521 stress
00522 scrollbar-arrow-color
00523 scrollbar-base-color
00524 scrollbar-dark-shadow-color
00525 scrollbar-face-color
00526 scrollbar-highlight-color
00527 scrollbar-shadow-color
00528 scrollbar-3d-light-color
00529 scrollbar-track-color
00530 table-layout
00531 text-align
00532 text-align-last
00533 text-decoration
00534 text-indent
00535 text-justify
00536 text-overflow
00537 text-shadow
00538 text-transform
00539 text-autospace
00540 text-kashida-space
00541 text-underline-position
00542 top
00543 unicode-bidi
00544 -use-link-source
00545 vertical-align
00546 visibility
00547 voice-family
00548 volume
00549 white-space
00550 widows
00551 width
00552 word-break
00553 word-spacing
00554 word-wrap
00555 writing-mode
00556 z-index
00557 zoom
00558   
00559 */
00560 /*
00561 class HTMLElement
00562 {
00563     String element;
00564     String attribute;
00565 
00566 
00567     
00568 };
00569 
00570 
00571 class HTMLParser
00572 {
00573     
00574 
00575 
00576 }; 
00577 */
00578 
00579 #endif
00580 

(C) An X-Ryl669 project 2007

This document describes Unlimited Zooming Interface source code. UZI stands for Unlimited Zooming Interface, and source code license is