General beast update, fixes, optimizations, features:

* Cleanups, optimizations, and new File::commonDocumentsDirectory enum
* Replace sortArray with std::sort for performance
* More error tolerance in XML parser, speedups
* Refactor some byte-order mark detection code
* Add String::appendCharPointer overloads
* More XML parser optimisations and better error detection
* Misc performance tweaks
* Fixes for support of non-UTF-8 strings
* Increased precision when storing strings in XmlElement
* Minor clean-ups
* Minor fix to XmlDocument
* Cleanups to CriticalSection and related synchronization primitives
* Fix DynamicArray unit test
2025-12-06 17:27:55 +00:00 · 2013-09-10 09:52:24 -07:00
parent 27307fca0c
commit 43e6d345e4
83 changed files with 433 additions and 687 deletions
--- a/modules/beast_core/xml/beast_XmlDocument.cpp
+++ b/modules/beast_core/xml/beast_XmlDocument.cpp
@@ -24,12 +24,18 @@
 XmlDocument::XmlDocument (const String& documentText)
    : originalText (documentText),
      input (nullptr),
+      outOfData (false),
+      errorOccurred (false),
+      needToLoadDTD (false),
      ignoreEmptyTextElements (true)
 {
 }

 XmlDocument::XmlDocument (const File& file)
    : input (nullptr),
+      outOfData (false),
+      errorOccurred (false),
+      needToLoadDTD (false),
      ignoreEmptyTextElements (true),
      inputSource (new FileInputSource (file))
 {
@@ -77,68 +83,69 @@ namespace XmlIdentifierChars
                                                                      : isIdentifierCharSlow (c);
    }

-    /*static void generateIdentifierCharConstants()
+    /*
+    static void generateIdentifierCharConstants()
    {
        uint32 n[8] = { 0 };
        for (int i = 0; i < 256; ++i)
-            if (isIdentifierCharSlow (i))
-                n[i >> 5] |= (1 << (i & 31));
+        if (isIdentifierCharSlow (i))
+            n[i >> 5] |= (1 << (i & 31));

        String s;
        for (int i = 0; i < 8; ++i)
            s << "0x" << String::toHexString ((int) n[i]) << ", ";

        DBG (s);
-    }*/
+    }
+    */
+
+    static String::CharPointerType findEndOfToken (String::CharPointerType p)
+    {
+        while (isIdentifierChar (*p))
+            ++p;
+
+        return p;
+    }
 }

 XmlElement* XmlDocument::getDocumentElement (const bool onlyReadOuterDocumentElement)
 {
-    String textToParse (originalText);
-
-    if (textToParse.isEmpty() && inputSource != nullptr)
+    if (originalText.isEmpty() && inputSource != nullptr)
    {
-        ScopedPointer <InputStream> in (inputSource->createInputStream());
+        ScopedPointer<InputStream> in (inputSource->createInputStream());

        if (in != nullptr)
        {
            MemoryOutputStream data;
            data.writeFromInputStream (*in, onlyReadOuterDocumentElement ? 8192 : -1);
-            textToParse = data.toString();

-            if (! onlyReadOuterDocumentElement)
-                originalText = textToParse;
+           #if BEAST_STRING_UTF_TYPE == 8
+            if (data.getDataSize() > 2)
+            {
+                data.writeByte (0);
+                const char* text = static_cast<const char*> (data.getData());
+
+                if (CharPointer_UTF16::isByteOrderMarkBigEndian (text)
+                      || CharPointer_UTF16::isByteOrderMarkLittleEndian (text))
+                {
+                    originalText = data.toString();
+                }
+                else
+                {
+                    if (CharPointer_UTF8::isByteOrderMark (text))
+                        text += 3;
+
+                    // parse the input buffer directly to avoid copying it all to a string..
+                    return parseDocumentElement (String::CharPointerType (text), onlyReadOuterDocumentElement);
+                }
+            }
+           #else
+            originalText = data.toString();
+           #endif
        }
    }

-    input = textToParse.getCharPointer();
-    lastError = String::empty;
-    errorOccurred = false;
-    outOfData = false;
-    needToLoadDTD = true;
-
-    if (textToParse.isEmpty())
-    {
-        lastError = "not enough input";
-    }
-    else
-    {
-        skipHeader();
-
-        if (input.getAddress() != nullptr)
-        {
-            ScopedPointer <XmlElement> result (readNextElement (! onlyReadOuterDocumentElement));
-
-            if (! errorOccurred)
-                return result.release();
-        }
-        else
-        {
-            lastError = "incorrect xml header";
-        }
-    }
-
-    return nullptr;
+    return parseDocumentElement (originalText.getCharPointer(), onlyReadOuterDocumentElement);
 }

 const String& XmlDocument::getLastParseError() const noexcept
@@ -156,7 +163,7 @@ String XmlDocument::getFileContents (const String& filename) const
 {
    if (inputSource != nullptr)
    {
-        const ScopedPointer <InputStream> in (inputSource->createInputStreamFor (filename.trim().unquoted()));
+        const ScopedPointer<InputStream> in (inputSource->createInputStreamFor (filename.trim().unquoted()));

        if (in != nullptr)
            return in->readEntireStreamAsString();
@@ -178,33 +185,56 @@ beast_wchar XmlDocument::readNextChar() noexcept
    return c;
 }

-int XmlDocument::findNextTokenLength() noexcept
+XmlElement* XmlDocument::parseDocumentElement (String::CharPointerType textToParse,
+                                               const bool onlyReadOuterDocumentElement)
 {
-    int len = 0;
-    beast_wchar c = *input;
+    input = textToParse;
+    errorOccurred = false;
+    outOfData = false;
+    needToLoadDTD = true;

-    while (XmlIdentifierChars::isIdentifierChar (c))
-        c = input [++len];
+    if (textToParse.isEmpty())
+    {
+        lastError = "not enough input";
+    }
+    else if (! parseHeader())
+    {
+        lastError = "malformed header";
+    }
+    else if (! parseDTD())
+    {
+        lastError = "malformed DTD";
+    }
+    else
+    {
+        lastError = String::empty;

-    return len;
+        ScopedPointer<XmlElement> result (readNextElement (! onlyReadOuterDocumentElement));
+
+        if (! errorOccurred)
+            return result.release();
+    }
+
+    return nullptr;
 }

-void XmlDocument::skipHeader()
+bool XmlDocument::parseHeader()
 {
-    const int headerStart = input.indexOf (CharPointer_UTF8 ("<?xml"));
+    skipNextWhiteSpace();

-    if (headerStart >= 0)
+    if (CharacterFunctions::compareUpTo (input, CharPointer_ASCII ("<?xml"), 5) == 0)
    {
-        const int headerEnd = (input + headerStart).indexOf (CharPointer_UTF8 ("?>"));
-        if (headerEnd < 0)
-            return;
+        const String::CharPointerType headerEnd (CharacterFunctions::find (input, CharPointer_ASCII ("?>")));
+
+        if (headerEnd.isEmpty())
+            return false;

       #if BEAST_DEBUG
-        const String header (input + headerStart, (size_t) (headerEnd - headerStart));
-        const String encoding (header.fromFirstOccurrenceOf ("encoding", false, true)
-                                     .fromFirstOccurrenceOf ("=", false, false)
-                                     .fromFirstOccurrenceOf ("\"", false, false)
-                                     .upToFirstOccurrenceOf ("\"", false, false).trim());
+        const String encoding (String (input, headerEnd)
+                                 .fromFirstOccurrenceOf ("encoding", false, true)
+                                 .fromFirstOccurrenceOf ("=", false, false)
+                                 .fromFirstOccurrenceOf ("\"", false, false)
+                                 .upToFirstOccurrenceOf ("\"", false, false).trim());

        /* If you load an XML document with a non-UTF encoding type, it may have been
           loaded wrongly.. Since all the files are read via the normal beast file streams,
@@ -216,58 +246,59 @@ void XmlDocument::skipHeader()
        bassert (encoding.isEmpty() || encoding.startsWithIgnoreCase ("utf-"));
       #endif

-        input += headerEnd + 2;
+        input = headerEnd + 2;
+        skipNextWhiteSpace();
    }

-    skipNextWhiteSpace();
+    return true;
+}

-    const int docTypeIndex = input.indexOf (CharPointer_UTF8 ("<!DOCTYPE"));
-    if (docTypeIndex < 0)
-        return;
-
-    input += docTypeIndex + 9;
-    const String::CharPointerType docType (input);
-
-    int n = 1;
-
-    while (n > 0)
+bool XmlDocument::parseDTD()
+{
+    if (CharacterFunctions::compareUpTo (input, CharPointer_ASCII ("<!DOCTYPE"), 9) == 0)
    {
-        const beast_wchar c = readNextChar();
+        input += 9;
+        const String::CharPointerType dtdStart (input);

-        if (outOfData)
-            return;
+        for (int n = 1; n > 0;)
+        {
+            const beast_wchar c = readNextChar();

-        if (c == '<')
-            ++n;
-        else if (c == '>')
-            --n;
+            if (outOfData)
+                return false;
+
+            if (c == '<')
+                ++n;
+            else if (c == '>')
+                --n;
+        }
+
+        dtdText = String (dtdStart, input - 1).trim();
    }

-    dtdText = String (docType, (size_t) (input.getAddress() - (docType.getAddress() + 1))).trim();
+    return true;
 }

 void XmlDocument::skipNextWhiteSpace()
 {
    for (;;)
    {
-        beast_wchar c = *input;
+        input = input.findEndOfWhitespace();

-        while (CharacterFunctions::isWhitespace (c))
-            c = *++input;
-
-        if (c == 0)
+        if (input.isEmpty())
        {
            outOfData = true;
            break;
        }
-        else if (c == '<')
+
+        if (*input == '<')
        {
            if (input[1] == '!'
                 && input[2] == '-'
                 && input[3] == '-')
            {
                input += 4;
-                const int closeComment = input.indexOf (CharPointer_UTF8 ("-->"));
+                const int closeComment = input.indexOf (CharPointer_ASCII ("-->"));

                if (closeComment < 0)
                {
@@ -278,10 +309,11 @@ void XmlDocument::skipNextWhiteSpace()
                input += closeComment + 3;
                continue;
            }
-            else if (input[1] == '?')
+            
+            if (input[1] == '?')
            {
                input += 2;
-                const int closeBracket = input.indexOf (CharPointer_UTF8 ("?>"));
+                const int closeBracket = input.indexOf (CharPointer_ASCII ("?>"));

                if (closeBracket < 0)
                {
@@ -318,7 +350,6 @@ void XmlDocument::readQuotedString (String& result)
        else
        {
            const String::CharPointerType start (input);
-            size_t numChars = 0;

            for (;;)
            {
@@ -326,13 +357,13 @@ void XmlDocument::readQuotedString (String& result)

                if (character == quote)
                {
-                    result.appendCharPointer (start, numChars);
+                    result.appendCharPointer (start, input);
                    ++input;
                    return;
                }
                else if (character == '&')
                {
-                    result.appendCharPointer (start, numChars);
+                    result.appendCharPointer (start, input);
                    break;
                }
                else if (character == 0)
@@ -343,7 +374,6 @@ void XmlDocument::readQuotedString (String& result)
                }

                ++input;
-                ++numChars;
            }
        }
    }
@@ -357,28 +387,26 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
    if (outOfData)
        return nullptr;

-    const int openBracket = input.indexOf ((beast_wchar) '<');
-
-    if (openBracket >= 0)
+    if (*input == '<')
    {
-        input += openBracket + 1;
-        int tagLen = findNextTokenLength();
+        ++input;
+        String::CharPointerType endOfToken (XmlIdentifierChars::findEndOfToken (input));

-        if (tagLen == 0)
+        if (endOfToken == input)
        {
            // no tag name - but allow for a gap after the '<' before giving an error
            skipNextWhiteSpace();
-            tagLen = findNextTokenLength();
+            endOfToken = XmlIdentifierChars::findEndOfToken (input);

-            if (tagLen == 0)
+            if (endOfToken == input)
            {
                setLastError ("tag name missing", false);
                return node;
            }
        }

-        node = new XmlElement (String (input, (size_t) tagLen));
-        input += tagLen;
+        node = new XmlElement (String (input, endOfToken));
+        input = endOfToken;
        LinkedListPointer<XmlElement::XmlAttributeNode>::Appender attributeAppender (node->attributes);

        // look for attributes
@@ -409,12 +437,12 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
            // get an attribute..
            if (XmlIdentifierChars::isIdentifierChar (c))
            {
-                const int attNameLen = findNextTokenLength();
+                String::CharPointerType attNameEnd (XmlIdentifierChars::findEndOfToken (input));

-                if (attNameLen > 0)
+                if (attNameEnd != input)
                {
                    const String::CharPointerType attNameStart (input);
-                    input += attNameLen;
+                    input = attNameEnd;

                    skipNextWhiteSpace();

@@ -427,7 +455,7 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
                        if (nextChar == '"' || nextChar == '\'')
                        {
                            XmlElement::XmlAttributeNode* const newAtt
-                                = new XmlElement::XmlAttributeNode (String (attNameStart, (size_t) attNameLen),
+                                = new XmlElement::XmlAttributeNode (String (attNameStart, attNameEnd),
                                                                    String::empty);

                            readQuotedString (newAtt->value);
@@ -435,6 +463,12 @@ XmlElement* XmlDocument::readNextElement (const bool alsoParseSubElements)
                            continue;
                        }
                    }
+                    else
+                    {
+                        setLastError ("expected '=' after attribute '"
+                                        + String (attNameStart, attNameEnd) + "'", false);
+                        return node;
+                    }
                }
            }
            else
@@ -467,7 +501,9 @@ void XmlDocument::readChildElements (XmlElement* parent)

        if (*input == '<')
        {
-            if (input[1] == '/')
+            const beast_wchar c1 = input[1];
+
+            if (c1 == '/')
            {
                // our close tag..
                const int closeTag = input.indexOf ((beast_wchar) '>');
@@ -477,41 +513,33 @@ void XmlDocument::readChildElements (XmlElement* parent)

                break;
            }
-            else if (input[1] == '!'
-                  && input[2] == '['
-                  && input[3] == 'C'
-                  && input[4] == 'D'
-                  && input[5] == 'A'
-                  && input[6] == 'T'
-                  && input[7] == 'A'
-                  && input[8] == '[')
+            
+            if (c1 == '!' && CharacterFunctions::compareUpTo (input + 2, CharPointer_ASCII ("[CDATA["), 7) == 0) 
            {
                input += 9;
                const String::CharPointerType inputStart (input);

-                size_t len = 0;
-
                for (;;)
                {
-                    if (*input == 0)
+                    const beast_wchar c0 = *input;
+
+                    if (c0 == 0)
                    {
                        setLastError ("unterminated CDATA section", false);
                        outOfData = true;
                        break;
                    }
-                    else if (input[0] == ']'
+                    else if (c0 == ']'
                              && input[1] == ']'
                              && input[2] == '>')
                    {
+                        childAppender.append (XmlElement::createTextElement (String (inputStart, input)));
                        input += 3;
                        break;
                    }

                    ++input;
-                    ++len;
                }
-
-                childAppender.append (XmlElement::createTextElement (String (inputStart, len)));
            }
            else
            {
@@ -522,7 +550,7 @@ void XmlDocument::readChildElements (XmlElement* parent)
                    break;
            }
        }
-        else  // must be a character block
+        else // must be a character block
        {
            input = preWhitespaceInput; // roll back to include the leading whitespace
            String textElementContent;
@@ -575,17 +603,15 @@ void XmlDocument::readChildElements (XmlElement* parent)
                else
                {
                    const String::CharPointerType start (input);
-                    size_t len = 0;

                    for (;;)
                    {
                        const beast_wchar nextChar = *input;

                        if (nextChar == '<' || nextChar == '&')
-                        {
                            break;
-                        }
-                        else if (nextChar == 0)
+
+                        if (nextChar == 0)
                        {
                            setLastError ("unmatched tags", false);
                            outOfData = true;
@@ -593,17 +619,14 @@ void XmlDocument::readChildElements (XmlElement* parent)
                        }

                        ++input;
-                        ++len;
                    }

-                    textElementContent.appendCharPointer (start, len);
+                    textElementContent.appendCharPointer (start, input);
                }
            }

            if ((! ignoreEmptyTextElements) || textElementContent.containsNonWhitespaceChars())
-            {
                childAppender.append (XmlElement::createTextElement (textElementContent));
-            }
        }
    }
 }
@@ -613,27 +636,27 @@ void XmlDocument::readEntity (String& result)
    // skip over the ampersand
    ++input;

-    if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("amp;"), 4) == 0)
+    if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("amp;"), 4) == 0)
    {
        input += 4;
        result += '&';
    }
-    else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("quot;"), 5) == 0)
+    else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("quot;"), 5) == 0)
    {
        input += 5;
        result += '"';
    }
-    else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("apos;"), 5) == 0)
+    else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("apos;"), 5) == 0)
    {
        input += 5;
        result += '\'';
    }
-    else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("lt;"), 3) == 0)
+    else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("lt;"), 3) == 0)
    {
        input += 3;
        result += '<';
    }
-    else if (input.compareIgnoreCaseUpTo (CharPointer_UTF8 ("gt;"), 3) == 0)
+    else if (input.compareIgnoreCaseUpTo (CharPointer_ASCII ("gt;"), 3) == 0)
    {
        input += 3;
        result += '>';
@@ -712,11 +735,11 @@ void XmlDocument::readEntity (String& result)

 String XmlDocument::expandEntity (const String& ent)
 {
-    if (ent.equalsIgnoreCase ("amp"))   return String::charToString ('&');
-    if (ent.equalsIgnoreCase ("quot"))  return String::charToString ('"');
-    if (ent.equalsIgnoreCase ("apos"))  return String::charToString ('\'');
-    if (ent.equalsIgnoreCase ("lt"))    return String::charToString ('<');
-    if (ent.equalsIgnoreCase ("gt"))    return String::charToString ('>');
+    if (ent.equalsIgnoreCase ("amp")) return String::charToString ('&');
+    if (ent.equalsIgnoreCase ("quot")) return String::charToString ('"');
+    if (ent.equalsIgnoreCase ("apos")) return String::charToString ('\'');
+    if (ent.equalsIgnoreCase ("lt")) return String::charToString ('<');
+    if (ent.equalsIgnoreCase ("gt")) return String::charToString ('>');

    if (ent[0] == '#')
    {
@@ -845,4 +868,4 @@ String XmlDocument::getParameterEntity (const String& entity)
    }

    return entity;
-}
+}