001package org.jsoup.safety;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.ParseErrorList;
012import org.jsoup.parser.Parser;
013import org.jsoup.select.NodeTraversor;
014import org.jsoup.select.NodeVisitor;
015
016import java.util.List;
017
018import static org.jsoup.internal.SharedConstants.DummyUri;
019
020/**
021 The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
022 that you are expecting; no junk, and no cross-site scripting attacks!
023 <p>
024 The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
025 HTML that is allowed by the safelist.
026 </p>
027 <p>
028 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
029 canned safe-lists only allow body contained tags.
030 </p>
031 <p>
032 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
033 </p>
034 */
035public class Cleaner {
036    private final Safelist safelist;
037
038    /**
039     Create a new cleaner, that sanitizes documents using the supplied safelist.
040     @param safelist safe-list to clean with
041     */
042    public Cleaner(Safelist safelist) {
043        Validate.notNull(safelist);
044        this.safelist = safelist;
045    }
046
047    /**
048     Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist.
049     The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The
050     OutputSettings of the original document are cloned into the clean document.
051     @param dirtyDocument Untrusted base document to clean.
052     @return cleaned document.
053     */
054    public Document clean(Document dirtyDocument) {
055        Validate.notNull(dirtyDocument);
056
057        Document clean = Document.createShell(dirtyDocument.baseUri());
058        copySafeNodes(dirtyDocument.body(), clean.body());
059        clean.outputSettings(dirtyDocument.outputSettings().clone());
060
061        return clean;
062    }
063
064    /**
065     Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the
066     tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the
067     <code>head</code>.
068     <p>
069     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
070     output of this method, the input document <b>must always</b> be normalized using a method such as
071     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
072     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
073     differences between how a given browser and how jsoup parses the input HTML are normalized.
074     </p>
075     <p>Example:
076     <pre>{@code
077     Document inputDoc = Jsoup.parse(inputHtml);
078     Cleaner cleaner = new Cleaner(Safelist.relaxed());
079     boolean isValid = cleaner.isValid(inputDoc);
080     Document normalizedDoc = cleaner.clean(inputDoc);
081     }</pre>
082     </p>
083     @param dirtyDocument document to test
084     @return true if no tags or attributes need to be removed; false if they do
085     */
086    public boolean isValid(Document dirtyDocument) {
087        Validate.notNull(dirtyDocument);
088
089        Document clean = Document.createShell(dirtyDocument.baseUri());
090        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
091        return numDiscarded == 0
092            && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head
093    }
094
095    /**
096     Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all
097     the tags and attributes in the input HTML are allowed by the safelist.
098     <p>
099     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
100     output of this method, the input document <b>must always</b> be normalized using a method such as
101     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
102     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
103     differences between how a given browser and how jsoup parses the input HTML are normalized.
104     </p>
105     <p>Example:
106     <pre>{@code
107     Document inputDoc = Jsoup.parse(inputHtml);
108     Cleaner cleaner = new Cleaner(Safelist.relaxed());
109     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
110     Document normalizedDoc = cleaner.clean(inputDoc);
111     }</pre>
112     </p>
113     @param bodyHtml HTML fragment to test
114     @return true if no tags or attributes need to be removed; false if they do
115     */
116    public boolean isValidBodyHtml(String bodyHtml) {
117        String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid
118        Document clean = Document.createShell(baseUri);
119        Document dirty = Document.createShell(baseUri);
120        ParseErrorList errorList = ParseErrorList.tracking(1);
121        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList);
122        dirty.body().insertChildren(0, nodes);
123        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
124        return numDiscarded == 0 && errorList.isEmpty();
125    }
126
127    /**
128     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
129     */
130    private final class CleaningVisitor implements NodeVisitor {
131        private int numDiscarded = 0;
132        private final Element root;
133        private Element destination; // current element to append nodes to
134
135        private CleaningVisitor(Element root, Element destination) {
136            this.root = root;
137            this.destination = destination;
138        }
139
140        @Override public void head(Node source, int depth) {
141            if (source instanceof Element) {
142                Element sourceEl = (Element) source;
143
144                if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs
145                    ElementMeta meta = createSafeElement(sourceEl);
146                    Element destChild = meta.el;
147                    destination.appendChild(destChild);
148
149                    numDiscarded += meta.numAttribsDiscarded;
150                    destination = destChild;
151                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
152                    numDiscarded++;
153                }
154            } else if (source instanceof TextNode) {
155                TextNode sourceText = (TextNode) source;
156                TextNode destText = new TextNode(sourceText.getWholeText());
157                destination.appendChild(destText);
158            } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
159              DataNode sourceData = (DataNode) source;
160              DataNode destData = new DataNode(sourceData.getWholeData());
161              destination.appendChild(destData);
162            } else { // else, we don't care about comments, xml proc instructions, etc
163                numDiscarded++;
164            }
165        }
166
167        @Override public void tail(Node source, int depth) {
168            if (source instanceof Element && safelist.isSafeTag(source.normalName())) {
169                destination = destination.parent(); // would have descended, so pop destination stack
170            }
171        }
172    }
173
174    private int copySafeNodes(Element source, Element dest) {
175        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
176        NodeTraversor.traverse(cleaningVisitor, source);
177        return cleaningVisitor.numDiscarded;
178    }
179
180    private ElementMeta createSafeElement(Element sourceEl) {
181        Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
182        String sourceTag = sourceEl.tagName();
183        Attributes destAttrs = dest.attributes();
184        dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy
185
186        int numDiscarded = 0;
187        Attributes sourceAttrs = sourceEl.attributes();
188        for (Attribute sourceAttr : sourceAttrs) {
189            if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
190                destAttrs.put(sourceAttr);
191            else
192                numDiscarded++;
193        }
194
195
196        Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
197        // special case for <a href rel=nofollow>, only apply to external links:
198        if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow")) {
199            String href = sourceEl.absUrl("href");
200            String sourceBase = sourceEl.baseUri();
201            if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow
202                enforcedAttrs.remove("rel");
203            }
204        }
205
206        destAttrs.addAll(enforcedAttrs);
207        dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
208        return new ElementMeta(dest, numDiscarded);
209    }
210
211    private static class ElementMeta {
212        Element el;
213        int numAttribsDiscarded;
214
215        ElementMeta(Element el, int numAttribsDiscarded) {
216            this.el = el;
217            this.numAttribsDiscarded = numAttribsDiscarded;
218        }
219    }
220
221}