General beast update, fixes, optimizations, features:

* Cleanups, optimizations, and new File::commonDocumentsDirectory enum
* Replace sortArray with std::sort for performance
* More error tolerance in XML parser, speedups
* Refactor some byte-order mark detection code
* Add String::appendCharPointer overloads
* More XML parser optimizations and better error detection
* Misc performance tweaks
* Fixes for support of non-UTF-8 strings
* Increased precision when storing strings in XmlElement
* Minor clean-ups
* Minor fix to XmlDocument
* Cleanups to CriticalSection and related synchronization primitives
* Fix DynamicArray unit test
This commit is contained in:
Vinnie Falco
2013-09-10 09:52:24 -07:00
parent 27307fca0c
commit 43e6d345e4
83 changed files with 433 additions and 687 deletions

View File

@@ -24,12 +24,18 @@
// Creates a document that will parse the XML held in documentText.
// The text is only stored here (in originalText); the constructor body is
// empty, so no parsing is performed at construction time.
XmlDocument::XmlDocument (const String& documentText)
: originalText (documentText),
input (nullptr),
// Parser state flags all start out cleared.
outOfData (false),
errorOccurred (false),
needToLoadDTD (false),
// Empty text elements are ignored by default.
ignoreEmptyTextElements (true)
{
}
XmlDocument::XmlDocument (const File& file)
: input (nullptr),
outOfData (false),
errorOccurred (false),
needToLoadDTD (false),
ignoreEmptyTextElements (true),
inputSource (new FileInputSource (file))
{
@@ -77,68 +83,69 @@ namespace XmlIdentifierChars
: isIdentifierCharSlow (c);
}
/*static void generateIdentifierCharConstants()
/*
static void generateIdentifierCharConstants()
{
uint32 n[8] = { 0 };
for (int i = 0; i < 256; ++i)
if (isIdentifierCharSlow (i))
n[i >> 5] |= (1 << (i & 31));
if (isIdentifierCharSlow (i))
n[i >> 5] |= (1 << (i & 31));
String s;
for (int i = 0; i < 8; ++i)
s << "0x" << String::toHexString ((int) n[i]) << ", ";
DBG (s);
}*/
}
*/
// Scans forward from p over consecutive characters accepted by
// isIdentifierChar() and returns the position of the first character that is
// not accepted. If the character at p is itself rejected, the returned
// position equals p (i.e. the token is empty).
// NOTE(review): termination relies on isIdentifierChar() rejecting the
// string's terminating character - confirm against its definition.
static String::CharPointerType findEndOfToken (String::CharPointerType p)
{
    String::CharPointerType tokenEnd (p);

    for (; isIdentifierChar (*tokenEnd); ++tokenEnd)
    {
        // nothing to do - the loop header performs the scan
    }

    return tokenEnd;
}
}
XmlElement* XmlDocument::getDocumentElement (const bool onlyReadOuterDocumentElement)
{
String textToParse (originalText);
if (textToParse.isEmpty() && inputSource != nullptr)
if (originalText.isEmpty() && inputSource != nullptr)
{
ScopedPointer <InputStream> in (inputSource->createInputStream());
ScopedPointer<InputStream> in (inputSource->createInputStream());
if (in != nullptr)
{
MemoryOutputStream data;
data.writeFromInputStream (*in, onlyReadOuterDocumentElement ? 8192 : -1);
textToParse = data.toString();
if (! onlyReadOuterDocumentElement)
originalText = textToParse;
#if BEAST_STRING_UTF_TYPE == 8
if (data.getDataSize() > 2)
{
data.writeByte (0);
const char* text = static_cast<const char*> (data.getData());
if (CharPointer_UTF16::isByteOrderMarkBigEndian (text)
|| CharPointer_UTF16::isByteOrderMarkLittleEndian (text))
{
originalText = data.toString();
}
else
{
if (CharPointer_UTF8::isByteOrderMark (text))
text += 3;
// parse the input buffer directly to avoid copying it all to a string..
return parseDocumentElement (String::CharPointerType (text), onlyReadOuterDocumentElement);
}
}
#else
originalText = data.toString();
#endif
}
}
input = textToParse.getCharPointer();
lastError = String::empty;
errorOccurred = false;
outOfData = false;
needToLoadDTD = true;
if (textToParse.isEmpty())
{
lastError = "not enough input";
}
else
{
skipHeader();
if (input.getAddress() != nullptr)
{
ScopedPointer <XmlElement> result (readNextElement (! onlyReadOuterDocumentElement));
if (! errorOccurred)
return result.release();
}
else
{
lastError = "incorrect xml header";
}
}
return nullptr;
return parseDocumentElement (originalText.getCharPointer(), onlyReadOuterDocumentElement);
}
const String& XmlDocument::getLastParseError() const noexcept
@@ -156,7 +163,7 @@ String XmlDocument::getFileContents (const String& filename) const
{
if (inputSource != nullptr)
{
const ScopedPointer <InputStream> in (inputSource->createInputStreamFor (filename.trim().unquoted()));
const ScopedPointer<InputStream> in (inputSource->createInputStreamFor (filename.trim().unquoted()));
if (in != nullptr)
return in->readEntireStreamAsString();
@@ -178,33 +185,56 @@ beast_wchar XmlDocument::readNextChar() noexcept
return c;
}
int XmlDocument::findNextTokenLength() noexcept
XmlElement* XmlDocument::parseDocumentElement (String::CharPointerType textToParse,
const bool onlyReadOuterDocumentElement)
{
int len = 0;
beast_wchar c = *input;
input = textToParse;
errorOccurred = false;
outOfData = false;
needToLoadDTD = true;
while (XmlIdentifierChars::isIdentifierChar (c))
c = input [++len];
if (textToParse.isEmpty())
{
lastError = "not enough input";
}
else if (! parseHeader())
{
lastError = "malformed header";
}
else if (! parseDTD())
{
lastError = "malformed DTD";
}
else
{
lastError = String::empty;
return len;
ScopedPointer<XmlElement> result (readNextElement (! onlyReadOuterDocumentElement));
if (! errorOccurred)
return result.release();
}
return nullptr;
}
void XmlDocument::skipHeader()
bool XmlDocument::parseHeader()
{
const int headerStart = input.indexOf (CharPointer_UTF8 ("<?xml"));
skipNextWhiteSpace();
if (headerStart >= 0)
if (CharacterFunctions::compareUpTo (input, CharPointer_ASCII ("<?xml"), 5) == 0)
{
const int headerEnd = (input + headerStart).indexOf (CharPointer_UTF8 ("?>"));
if (headerEnd < 0)
return;
const String::CharPointerType headerEnd (CharacterFunctions::find (input, CharPointer_ASCII ("?>")));
if (headerEnd.isEmpty())
return false;
#if BEAST_DEBUG
const String header (input + headerStart, (size_t) (headerEnd - headerStart));
const String encoding (header.fromFirstOccurrenceOf ("encoding", false, true)
.fromFirstOccurrenceOf ("=", false, false)
.fromFirstOccurrenceOf ("\"", false, false)
.upToFirstOccurrenceOf ("\"", false, false).trim());
const String encoding (String (input, headerEnd)
.fromFirstOccurrenceOf ("encoding", false, true)
.fromFirstOccurrenceOf ("=", false, false)
.fromFirstOccurrenceOf ("\"", false, false)
.upToFirstOccurrenceOf ("\"", false, false).trim());
/* If you load an XML document with a non-UTF encoding type, it may have been
loaded wrongly.. Since all the files are read via the normal beast file streams,
@@ -216,58 +246,59 @@ void XmlDocument::skipHeader()
bassert (encoding.isEmpty() || encoding.startsWithIgnoreCase ("utf-"));
#endif
input += headerEnd + 2;
input = headerEnd + 2;
skipNextWhiteSpace();
}
skipNextWhiteSpace();
return true;
}
const int docTypeIndex = input.indexOf (CharPointer_UTF8 ("<!DOCTYPE"));
if (docTypeIndex < 0)
return;
input += docTypeIndex + 9;
const String::CharPointerType docType (input);
int n = 1;
while (n > 0)
bool XmlDocument::parseDTD()
{
if (CharacterFunctions::compareUpTo (input, CharPointer_ASCII ("<!DOCTYPE"), 9) == 0)
{
const beast_wchar c = readNextChar();
input += 9;
const String::CharPointerType dtdStart (input);
if (outOfData)
return;
for (int n = 1; n > 0;)
{
const beast_wchar c = readNextChar();
if (c == '<')
++n;
else if (c == '>')
--n;
if (outOfData)
return false;
if (c == '<')
++n;
else if (c == '>')
--n;
}
dtdText = String (dtdStart, input - 1).trim();
}
dtdText = String (docType, (size_t) (input.getAddress() - (docType.getAddress() + 1))).trim();
return true;
}
void XmlDocument::skipNextWhiteSpace()
{
for (;;)
{
beast_wchar c = *input;
input = input.findEndOfWhitespace();
while (CharacterFunctions::isWhitespace (c))
c = *++input;
if (c == 0)
if (input.isEmpty())
{
outOfData = true;
break;
}
else if (c == '<')
if (*input == '<')
{
if (input[1] == '!'
&& input[2] == '-'
&& input[3] == '-')
{
input += 4;
const int closeComment = input.indexOf (CharPointer_UTF8 ("-->"));
const int closeComment = input.indexOf (CharPointer_ASCII ("-->"));
if (closeComment < 0)
{
@@ -278,10 +309,11 @@ void XmlDocument::skipNextWhiteSpace()
input += closeComment + 3;
continue;
}
else if (input[1] == '?')
if (input[1] == '?')
{
input += 2;
const int closeBracket = input.indexOf (CharPointer_UTF8 ("?>"));
const int closeBracket = input.indexOf (CharPointer_ASCII ("?>"));
if (closeBracket < 0)
{
@@ -318,7 +350,6 @@ void XmlDocument::readQuotedString (String& result)
else
{
const String::CharPointerType start (input);
size_t numChars = 0;
for (;;)
{
@@ -326,13 +357,13 @@ void XmlDocument::readQuotedString (String& result)
if (character == quote)
{
result.appendCharPointer (start, numChars);
result.appendCharPointer (start, input);
++input;
return;
}
else if (character == '&')
{
result.appendCharPointer (start, numChars);
result.appendCharPointer (start, input);
break;
}
else if (character == 0)
@@ -343,7 +374,6 @@ void XmlDocument::readQuotedString (String& result)
}
++input;
++numChars;
}
}
}
@@ -357,28 +387,26 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
if (outOfData)
return nullptr;
const int openBracket = input.indexOf ((beast_wchar) '<');
if (openBracket >= 0)
if (*input == '<')
{
input += openBracket + 1;
int tagLen = findNextTokenLength();
++input;
String::CharPointerType endOfToken (XmlIdentifierChars::findEndOfToken (input));
if (tagLen == 0)
if (endOfToken == input)
{
// no tag name - but allow for a gap after the '<' before giving an error
skipNextWhiteSpace();
tagLen = findNextTokenLength();
endOfToken = XmlIdentifierChars::findEndOfToken (input);
if (tagLen == 0)
if (endOfToken == input)
{
setLastError ("tag name missing", false);
return node;
}
}
node = new XmlElement (String (input, (size_t) tagLen));
input += tagLen;
node = new XmlElement (String (input, endOfToken));
input = endOfToken;
LinkedListPointer<XmlElement::XmlAttributeNode>::Appender attributeAppender (node->attributes);
// look for attributes
@@ -409,12 +437,12 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
// get an attribute..
if (XmlIdentifierChars::isIdentifierChar (c))
{
const int attNameLen = findNextTokenLength();
String::CharPointerType attNameEnd (XmlIdentifierChars::findEndOfToken (input));
if (attNameLen > 0)
if (attNameEnd != input)
{
const String::CharPointerType attNameStart (input);
input += attNameLen;
input = attNameEnd;
skipNextWhiteSpace();
@@ -427,7 +455,7 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
if (nextChar == '"' || nextChar == '\'')
{
XmlElement::XmlAttributeNode* const newAtt
= new XmlElement::XmlAttributeNode (String (attNameStart, (size_t) attNameLen),
= new XmlElement::XmlAttributeNode (String (attNameStart, attNameEnd),
String::empty);
readQuotedString (newAtt->value);
@@ -435,6 +463,12 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
continue;
}
}
else
{
setLastError ("expected '=' after attribute '"
+ String (attNameStart, attNameEnd) + "'", false);
return node;
}
}
}
else
@@ -467,7 +501,9 @@ void XmlDocument::readChildElements (XmlElement* parent)
if (*input == '<')
{
if (input[1] == '/')
const beast_wchar c1 = input[1];
if (c1 == '/')
{
// our close tag..
const int closeTag = input.indexOf ((beast_wchar) '>');
@@ -477,41 +513,33 @@ void XmlDocument::readChildElements (XmlElement* parent)
break;
}
else if (input[1] == '!'
&& input[2] == '['
&& input[3] == 'C'
&& input[4] == 'D'
&& input[5] == 'A'
&& input[6] == 'T'
&& input[7] == 'A'
&& input[8] == '[')
if (c1 == '!' && CharacterFunctions::compareUpTo (input + 2, CharPointer_ASCII ("[CDATA["), 7) == 0)
{
input += 9;
const String::CharPointerType inputStart (input);
size_t len = 0;
for (;;)
{
if (*input == 0)
const beast_wchar c0 = *input;
if (c0 == 0)
{
setLastError ("unterminated CDATA section", false);
outOfData = true;
break;
}
else if (input[0] == ']'
else if (c0 == ']'
&& input[1] == ']'
&& input[2] == '>')
{
childAppender.append (XmlElement::createTextElement (String (inputStart, input)));
input += 3;
break;
}
++input;
++len;
}
childAppender.append (XmlElement::createTextElement (String (inputStart, len)));
}
else
{
@@ -522,7 +550,7 @@ void XmlDocument::readChildElements (XmlElement* parent)
break;
}
}
else // must be a character block
else // must be a character block
{
input = preWhitespaceInput; // roll back to include the leading whitespace
String textElementContent;
@@ -575,17 +603,15 @@ void XmlDocument::readChildElements (XmlElement* parent)
else
{
const String::CharPointerType start (input);
size_t len = 0;
for (;;)
{
const beast_wchar nextChar = *input;
if (nextChar == '<' || nextChar == '&')
{
break;
}
else if (nextChar == 0)
if (nextChar == 0)
{
setLastError ("unmatched tags", false);
outOfData = true;
@@ -593,17 +619,14 @@ void XmlDocument::readChildElements (XmlElement* parent)
}
++input;
++len;
}
textElementContent.appendCharPointer (start, len);
textElementContent.appendCharPointer (start, input);
}
}
if ((! ignoreEmptyTextElements) || textElementContent.containsNonWhitespaceChars())
{
childAppender.append (XmlElement::createTextElement (textElementContent));
}
}
}
}
@@ -613,27 +636,27 @@ void XmlDocument::readEntity (String& result)
// skip over the ampersand
++input;
if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("amp;"), 4) == 0)
if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("amp;"), 4) == 0)
{
input += 4;
result += '&';
}
else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("quot;"), 5) == 0)
else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("quot;"), 5) == 0)
{
input += 5;
result += '"';
}
else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("apos;"), 5) == 0)
else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("apos;"), 5) == 0)
{
input += 5;
result += '\'';
}
else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("lt;"), 3) == 0)
else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("lt;"), 3) == 0)
{
input += 3;
result += '<';
}
else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("gt;"), 3) == 0)
else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("gt;"), 3) == 0)
{
input += 3;
result += '>';
@@ -712,11 +735,11 @@ void XmlDocument::readEntity (String& result)
String XmlDocument::expandEntity (const String& ent)
{
if (ent.equalsIgnoreCase ("amp")) return String::charToString ('&');
if (ent.equalsIgnoreCase ("quot")) return String::charToString ('"');
if (ent.equalsIgnoreCase ("apos")) return String::charToString ('\'');
if (ent.equalsIgnoreCase ("lt")) return String::charToString ('<');
if (ent.equalsIgnoreCase ("gt")) return String::charToString ('>');
if (ent.equalsIgnoreCase ("amp")) return String::charToString ('&');
if (ent.equalsIgnoreCase ("quot")) return String::charToString ('"');
if (ent.equalsIgnoreCase ("apos")) return String::charToString ('\'');
if (ent.equalsIgnoreCase ("lt")) return String::charToString ('<');
if (ent.equalsIgnoreCase ("gt")) return String::charToString ('>');
if (ent[0] == '#')
{
@@ -845,4 +868,4 @@ String XmlDocument::getParameterEntity (const String& entity)
}
return entity;
}
}