00001 #ifndef hpp_CPP_Parser_CPP_hpp
00002 #define hpp_CPP_Parser_CPP_hpp
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "DTDTypes.hpp"
00015
00016 #include "GenericElements.hpp"
00017
00018 #include "GenericAttributes.hpp"
00019
00020 #include "../Streams/Streams.hpp"
00021
00022 #include "../Tree/NTree.hpp"
00023
00024 #include "Allocator.hpp"
00025
00026 #include "HTMLEscape.hpp"
00027
00029 namespace HTML
00030 {
00032 class BuildRadix
00033 {
00034 private:
00036 uint32 radix;
00037 public:
00041 inline void addToRadix(tchar ch) throw() { if (ch >= 'A' && ch <= 'Z') radix |= 1 << (ch - 'A');
00042 if (ch >= 'a' && ch <= 'z') radix |= 1 << (ch - 'a'); }
00043
00048 inline bool checkAgainstLowercase(tchar ch) const throw() { return (radix & (1 << (ch - 'a'))) > 0; }
00049
00054 inline bool checkAgainstUppercase(tchar ch) const throw() { return (radix & (1 << (ch - 'A'))) > 0; }
00055
00057 BuildRadix() : radix(0) {}
00058 };
00059
00138 class Parser
00139 {
00140 public:
00142 enum ParsingTime
00143 {
00144 InstantParsing = 0,
00145 DelayedParsing = 1,
00146 };
00147
00151 public:
00153 static const BuildRadix radixElements;
00154 private:
00156 Stream::MemoryBufferedStream inputStream;
00161 struct ParsingError
00162 {
00164 enum ErrorCode
00165 {
00166 Valid = 0,
00167
00169 WarningBadStructure = 0x00000001,
00170 WarningBadEntity = 0x00000002,
00171 WarningUnknownAttribute = 0x00000003,
00172 WarningBadElement = 0x00000004,
00173 WarningBadAttributeValue = 0x00000005,
00174 WarningDifferentLengths = 0x00000006,
00175 WarningNotHTML = 0x00000007,
00176 WarningEndTagExpected = 0x00000008,
00177 WarningEndTagUnexpected = 0x00000009,
00178 WarningEndTagMismatch = 0x0000000A,
00179 WarningStartTagInstead = 0x0000000B,
00180 WarningStartTagUnexpected = 0x0000000C,
00181
00183 ErrorBadStructure = 0x80000001,
00184 ErrorBadEntity = 0x80000002,
00185 ErrorUnknownAttribute = 0x80000003,
00186 ErrorBadElement = 0x80000004,
00187 ErrorBadAttributeValue = 0x80000005,
00188 ErrorDocumentToBig = 0x80000006,
00189 ErrorInvalidInputStream = 0x80000700,
00190 ErrorNotEnoughMemory = 0x80000701,
00191
00192 ErrorNotHTML = 0x80000711,
00193
00195 ErrorMask = 0x80000000,
00196
00197 };
00198
00200 ErrorCode errorCode;
00202 uint32 errorPosition;
00204 uint32 errorLength;
00206 ParsingError * nextError;
00207
00208
00210 ParsingError(const ErrorCode & code, uint32 pos, uint32 len) : errorCode(code), errorPosition(pos), errorLength(len), nextError(0) {}
00212 ~ParsingError() { if (nextError) delete nextError; nextError = 0; }
00214 void chainError(ParsingError * newError)
00215 {
00216 if (errorCode == Valid)
00217 {
00218 errorCode = newError->errorCode;
00219 errorPosition = newError->errorPosition;
00220 errorLength = newError->errorLength;
00221 nextError = newError->nextError;
00222 newError->nextError = 0;
00223 delete newError;
00224 }
00225 else if (nextError == 0)
00226 nextError = newError;
00227 else
00228 nextError->chainError(newError);
00229 }
00231 const tchar * getErrorString() const
00232 {
00233
00234
00235
00236 switch(errorCode)
00237 {
00238 case Valid:
00239 return "No error, the document is valid";
00240 case WarningBadStructure:
00241 return "The document is badly structured, see the content of errorPosition for details where";
00242 case WarningBadEntity:
00243 return "The document contains a bad entity (not declared in current standard), see content of errorPosition for details where";
00244 case WarningUnknownAttribute:
00245 return "One attribute for an element is unknown, see the content of errorPosition for details where";
00246 case WarningBadElement:
00247 return "The document contains a bad element (not declared in current standard), see content of errorPosition for details where";
00248 case WarningBadAttributeValue:
00249 return "The document contains a bad attribute value (not declared in current standard), see content of errorPosition for details where";
00250 case WarningDifferentLengths:
00251 return "The stream length is not the same as the detected text length, see errorPosition for detected text length and errorLength for declared stream length";
00252 case WarningNotHTML:
00253 return "The document doesn't start by an HTML element as root";
00254 case WarningEndTagExpected:
00255 return "An end tag was expected, see the content of errorPosition for details where";
00256 case WarningEndTagUnexpected:
00257 return "An unexpected end tag has been found, see the content of errorPosition for details where";
00258 case WarningEndTagMismatch:
00259 return "An mismatching end tag has been found, see the content of errorPosition for details where";
00260 case WarningStartTagInstead:
00261 return "An end tag was expected but a start tag has been found, see the content of errorPosition for details where";
00262 case WarningStartTagUnexpected:
00263 return "An unexpected start tag has been found, see the content of errorPosition for details where";
00264
00265
00266 case ErrorBadStructure:
00267 return "The document is badly structured, see the content of errorPosition for details where";
00268 case ErrorBadEntity:
00269 return "The document contains a bad entity (not declared in current standard), see content of errorPosition for details where";
00270 case ErrorUnknownAttribute:
00271 return "One attribute for an element is unknown, see the content of errorPosition for details where";
00272 case ErrorBadElement:
00273 return "The document contains a bad element (not declared in current standard), see content of errorPosition for details where";
00274 case ErrorBadAttributeValue:
00275 return "The document contains a bad attribute value (not declared in current standard), see content of errorPosition for details where";
00276 case ErrorDocumentToBig:
00277 return "The document is far too big to be parsed, set errorLength for declared document size";
00278 case ErrorInvalidInputStream:
00279 return "The input stream is invalid or can't be read";
00280 case ErrorNotEnoughMemory:
00281 return "Not enough memory to allocate elements";
00282 case ErrorNotHTML:
00283 return "The given document is clearly not HTML";
00284 default:
00285 return "This error code is unknown";
00286 }
00287 }
00289 inline bool isError() const
00290 {
00291 if ((uint32)errorCode & (uint32)ErrorMask) return true;
00292 else if (nextError) return nextError->isError();
00293 return false;
00294 }
00296 inline bool isWarning() const
00297 {
00298 if (((uint32)errorCode & (uint32)ErrorMask) == 0 && errorCode != Valid ) return true;
00299 else if (nextError) return nextError->isWarning();
00300 return false;
00301 }
00303 inline uint32 chainedErrorCount() const
00304 {
00305 if (nextError) return nextError->chainedErrorCount() + 1;
00306 return 1;
00307 }
00308 };
00309
00311 DOMTree DOMtree;
00313 const CharsetFunctions & charsetFunctions;
00317 DOM::NodeDeleter parserDeleter;
00319 ParsingTime parsingTime;
00321 HTML::DTDType chosenDTD;
00323 ParsingError lastParsingError;
00324
00325
00326 public:
00333 Parser(Stream::InputStream & inputStreamRef, HTML::Elements::Allocators::BaseAllocator & allocatorRef, const ParsingTime & whenToParse = InstantParsing, const HTML::DTDType & dtd = HTML::StandardDTD);
00335 ~Parser()
00336 {
00337
00338 }
00339
00340
00344 const ParsingError & Parse();
00345
00347 inline const ParsingError & getLastParsingError() const { return lastParsingError; }
00348
00350 inline HTML::DOMTree & getDOMTree() { return DOMtree; }
00352 inline const HTML::DOMTree & getDOMTree() const { return DOMtree; }
00353
00357 const unsigned char * getUnboundedAccessToStream(const uint32 startPos) { if (startPos < inputStream.fullSize()) return inputStream.getBuffer() + startPos; return 0; }
00358
00359
00360 private:
00362 uint32 parseAttribute(const tchar * buffer, uint32 & currentPosition, int & remainingLength, int & attribStartPos, int & attribEndPos, int & attribContentStart, int & attribContentEnd);
00364 void parseAttributeIntoNode(DOMTree::Node * newNode, const tchar * buffer, uint32 & currentPosition, int & remainingLength, int & attribStartPos, int & attribEndPos, int & attribContentStart, int & attribContentEnd);
00365 };
00366
00367
00368 }
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578
00579 #endif
00580