001/****************************************************************
002 * Licensed to the Apache Software Foundation (ASF) under one   *
003 * or more contributor license agreements.  See the NOTICE file *
004 * distributed with this work for additional information        *
005 * regarding copyright ownership.  The ASF licenses this file   *
006 * to you under the Apache License, Version 2.0 (the            *
007 * "License"); you may not use this file except in compliance   *
008 * with the License.  You may obtain a copy of the License at   *
009 *                                                              *
010 *   http://www.apache.org/licenses/LICENSE-2.0                 *
011 *                                                              *
012 * Unless required by applicable law or agreed to in writing,   *
013 * software distributed under the License is distributed on an  *
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015 * KIND, either express or implied.  See the License for the    *
016 * specific language governing permissions and limitations      *
017 * under the License.                                           *
018 ****************************************************************/
019
020package org.apache.james.mime4j.codec;
021
022import java.io.ByteArrayInputStream;
023import java.io.ByteArrayOutputStream;
024import java.io.IOException;
025import java.io.UnsupportedEncodingException;
026import java.nio.charset.Charset;
027import java.util.regex.Matcher;
028import java.util.regex.Pattern;
029
030import org.apache.james.mime4j.util.CharsetUtil;
031
032/**
033 * Static methods for decoding strings, byte arrays and encoded words.
034 */
035public class DecoderUtil {
036
037    private static final Pattern PATTERN_ENCODED_WORD = Pattern.compile(
038            "(.*?)=\\?(.+?)\\?(\\w)\\?(.+?)\\?=", Pattern.DOTALL);
039
040    /**
041     * Decodes a string containing quoted-printable encoded data.
042     *
043     * @param s the string to decode.
044     * @return the decoded bytes.
045     */
046    private static byte[] decodeQuotedPrintable(String s, DecodeMonitor monitor) {
047        ByteArrayOutputStream baos = new ByteArrayOutputStream();
048
049        try {
050            byte[] bytes = s.getBytes("US-ASCII");
051
052            QuotedPrintableInputStream is = new QuotedPrintableInputStream(
053                                               new ByteArrayInputStream(bytes), monitor);
054
055            int b = 0;
056            while ((b = is.read()) != -1) {
057                baos.write(b);
058            }
059        } catch (IOException e) {
060            // This should never happen!
061            throw new IllegalStateException(e);
062        }
063
064        return baos.toByteArray();
065    }
066
067    /**
068     * Decodes a string containing base64 encoded data.
069     *
070     * @param s the string to decode.
071     * @param monitor
072     * @return the decoded bytes.
073     */
074    private static byte[] decodeBase64(String s, DecodeMonitor monitor) {
075        ByteArrayOutputStream baos = new ByteArrayOutputStream();
076
077        try {
078            byte[] bytes = s.getBytes("US-ASCII");
079
080            Base64InputStream is = new Base64InputStream(
081                                        new ByteArrayInputStream(bytes), monitor);
082
083            int b = 0;
084            while ((b = is.read()) != -1) {
085                baos.write(b);
086            }
087        } catch (IOException e) {
088            // This should never happen!
089            throw new IllegalStateException(e);
090        }
091
092        return baos.toByteArray();
093    }
094
095    /**
096     * Decodes an encoded text encoded with the 'B' encoding (described in
097     * RFC 2047) found in a header field body.
098     *
099     * @param encodedText the encoded text to decode.
100     * @param charset the Java charset to use.
101     * @param monitor
102     * @return the decoded string.
103     * @throws UnsupportedEncodingException if the given Java charset isn't
104     *         supported.
105     */
106    static String decodeB(String encodedText, String charset, DecodeMonitor monitor)
107            throws UnsupportedEncodingException {
108        byte[] decodedBytes = decodeBase64(encodedText, monitor);
109        return new String(decodedBytes, charset);
110    }
111
112    /**
113     * Decodes an encoded text encoded with the 'Q' encoding (described in
114     * RFC 2047) found in a header field body.
115     *
116     * @param encodedText the encoded text to decode.
117     * @param charset the Java charset to use.
118     * @return the decoded string.
119     * @throws UnsupportedEncodingException if the given Java charset isn't
120     *         supported.
121     */
122    static String decodeQ(String encodedText, String charset, DecodeMonitor monitor)
123            throws UnsupportedEncodingException {
124        encodedText = replaceUnderscores(encodedText);
125
126        byte[] decodedBytes = decodeQuotedPrintable(encodedText, monitor);
127        return new String(decodedBytes, charset);
128    }
129
130    static String decodeEncodedWords(String body)  {
131        return decodeEncodedWords(body, DecodeMonitor.SILENT);
132    }
133
134    /**
135     * Decodes a string containing encoded words as defined by RFC 2047. Encoded
136     * words have the form =?charset?enc?encoded-text?= where enc is either 'Q'
137     * or 'q' for quoted-printable and 'B' or 'b' for base64.
138     *
139     * @param body the string to decode
140     * @param monitor the DecodeMonitor to be used.
141     * @return the decoded string.
142     * @throws IllegalArgumentException only if the DecodeMonitor strategy throws it (Strict parsing)
143     */
144    public static String decodeEncodedWords(String body, DecodeMonitor monitor) throws IllegalArgumentException {
145        int tailIndex = 0;
146        boolean lastMatchValid = false;
147
148        StringBuilder sb = new StringBuilder();
149
150        for (Matcher matcher = PATTERN_ENCODED_WORD.matcher(body); matcher.find();) {
151            String separator = matcher.group(1);
152            String mimeCharset = matcher.group(2);
153            String encoding = matcher.group(3);
154            String encodedText = matcher.group(4);
155
156            String decoded = null;
157            decoded = tryDecodeEncodedWord(mimeCharset, encoding, encodedText, monitor);
158            if (decoded == null) {
159                sb.append(matcher.group(0));
160            } else {
161                if (!lastMatchValid || !CharsetUtil.isWhitespace(separator)) {
162                    sb.append(separator);
163                }
164                sb.append(decoded);
165            }
166
167            tailIndex = matcher.end();
168            lastMatchValid = decoded != null;
169        }
170
171        if (tailIndex == 0) {
172            return body;
173        } else {
174            sb.append(body.substring(tailIndex));
175            return sb.toString();
176        }
177    }
178
179    // return null on error
180    private static String tryDecodeEncodedWord(final String mimeCharset,
181            final String encoding, final String encodedText, final DecodeMonitor monitor) {
182        Charset charset = CharsetUtil.lookup(mimeCharset);
183        if (charset == null) {
184            monitor(monitor, mimeCharset, encoding, encodedText, "leaving word encoded",
185                    "Mime charser '", mimeCharset, "' doesn't have a corresponding Java charset");
186            return null;
187        }
188
189        if (encodedText.length() == 0) {
190            monitor(monitor, mimeCharset, encoding, encodedText, "leaving word encoded",
191                    "Missing encoded text in encoded word");
192            return null;
193        }
194
195        try {
196            if (encoding.equalsIgnoreCase("Q")) {
197                return DecoderUtil.decodeQ(encodedText, charset.name(), monitor);
198            } else if (encoding.equalsIgnoreCase("B")) {
199                return DecoderUtil.decodeB(encodedText, charset.name(), monitor);
200            } else {
201                monitor(monitor, mimeCharset, encoding, encodedText, "leaving word encoded",
202                        "Warning: Unknown encoding in encoded word");
203                return null;
204            }
205        } catch (UnsupportedEncodingException e) {
206            // should not happen because of isDecodingSupported check above
207            monitor(monitor, mimeCharset, encoding, encodedText, "leaving word encoded",
208                    "Unsupported encoding (", e.getMessage(), ") in encoded word");
209            return null;
210        } catch (RuntimeException e) {
211            monitor(monitor, mimeCharset, encoding, encodedText, "leaving word encoded",
212                    "Could not decode (", e.getMessage(), ") encoded word");
213            return null;
214        }
215    }
216
217    private static void monitor(DecodeMonitor monitor, String mimeCharset, String encoding,
218            String encodedText, String dropDesc, String... strings) throws IllegalArgumentException {
219        if (monitor.isListening()) {
220            String encodedWord = recombine(mimeCharset, encoding, encodedText);
221            StringBuilder text = new StringBuilder();
222            for (String str : strings) {
223                text.append(str);
224            }
225            text.append(" (");
226            text.append(encodedWord);
227            text.append(")");
228            String exceptionDesc = text.toString();
229            if (monitor.warn(exceptionDesc, dropDesc))
230                throw new IllegalArgumentException(text.toString());
231        }
232    }
233
234    private static String recombine(final String mimeCharset,
235            final String encoding, final String encodedText) {
236        return "=?" + mimeCharset + "?" + encoding + "?" + encodedText + "?=";
237    }
238
239    // Replace _ with =20
240    private static String replaceUnderscores(String str) {
241        // probably faster than String#replace(CharSequence, CharSequence)
242
243        StringBuilder sb = new StringBuilder(128);
244
245        for (int i = 0; i < str.length(); i++) {
246            char c = str.charAt(i);
247            if (c == '_') {
248                sb.append("=20");
249            } else {
250                sb.append(c);
251            }
252        }
253
254        return sb.toString();
255    }
256}