// BE THE CODER - org/jsoup/safety/Cleaner.java


package org.jsoup.safety;



import org.jsoup.helper.Validate;
import org.jsoup.nodes.*;
import org.jsoup.parser.Tag;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;



/**

 The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes

 that you are expecting; no junk, and no cross-site scripting attacks!

 <p/>

 The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain

 HTML that is allowed by the whitelist.

 <p/>

 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the

 canned white-lists only allow body contained tags.

 <p/>

 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.

 */

public class Cleaner {

    private Whitelist whitelist;



    /**

     Create a new cleaner, that sanitizes documents using the supplied whitelist.

     @param whitelist white-list to clean with

     */

    public Cleaner(Whitelist whitelist) {

        Validate.notNull(whitelist);

        this.whitelist = whitelist;

    }



    /**

     Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.

     The original document is not modified. Only elements from the dirt document's <code>body</code> are used.

     @param dirtyDocument Untrusted base document to clean.

     @return cleaned document.

     */

    public Document clean(Document dirtyDocument) {

        Validate.notNull(dirtyDocument);



        Document clean = Document.createShell(dirtyDocument.baseUri());

        copySafeNodes(dirtyDocument.body(), clean.body());



        return clean;

    }



    /**

     Dertmines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes

     in the input HTML are allowed by the whitelist.

     <p/>

     This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully

     using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document

     to ensure enforced attributes are set correctly, and that the output is tidied.

     @param dirtyDocument document to test

     @return true if no tags or attributes need to be removed; false if they do

     */

    public boolean isValid(Document dirtyDocument) {

        Validate.notNull(dirtyDocument);



        Document clean = Document.createShell(dirtyDocument.baseUri());

        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());

        return numDiscarded == 0;

    }



    /**

     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.

     @param source source of HTML

     @param dest destination element to copy into

     @return number of discarded elements (that were considered unsafe)

     */

    private int copySafeNodes(Element source, Element dest) {

        List<Node> sourceChildren = source.childNodes();

        int numDiscarded = 0;



        for (Node sourceChild : sourceChildren) {

            if (sourceChild instanceof Element) {

                Element sourceEl = (Element) sourceChild;



                if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs

                    ElementMeta meta = createSafeElement(sourceEl);

                    Element destChild = meta.el;

                    dest.appendChild(destChild);



                    numDiscarded += meta.numAttribsDiscarded;

                    numDiscarded += copySafeNodes(sourceEl, destChild); // recurs

                } else { // not a safe tag, but it may have children (els or text) that are, so recurse

                    numDiscarded++;

                    numDiscarded += copySafeNodes(sourceEl, dest);

                }

            } else if (sourceChild instanceof TextNode) {

                TextNode sourceText = (TextNode) sourceChild;

                TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri());

                dest.appendChild(destText);

            } // else, we don't care about comments, xml proc instructions, etc

        }

        return numDiscarded;

    }



    private ElementMeta createSafeElement(Element sourceEl) {

        String sourceTag = sourceEl.tagName();

        Attributes destAttrs = new Attributes();

        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);

        int numDiscarded = 0;



        Attributes sourceAttrs = sourceEl.attributes();

        for (Attribute sourceAttr : sourceAttrs) {

            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))

                destAttrs.put(sourceAttr);

            else

                numDiscarded++;

        }

        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);

        destAttrs.addAll(enforcedAttrs);



        return new ElementMeta(dest, numDiscarded);

    }



    private static class ElementMeta {

        Element el;

        int numAttribsDiscarded;



        ElementMeta(Element el, int numAttribsDiscarded) {

            this.el = el;

            this.numAttribsDiscarded = numAttribsDiscarded;

        }

    }



}
// Open Source Repository
// Home /web/jsoup-1.6.1 | Repository Home
// Open Source Repository