001    /*
002     * Copyright 2003-2008 the original author or authors.
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *     http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     *
016     * You are receiving this code free of charge, which represents many hours of
017     * effort from other individuals and corporations.  As a responsible member 
018     * of the community, you are asked (but not required) to donate any 
019     * enhancements or improvements back to the community under a similar open 
020     * source license.  Thank you. -TMN
021     */
022    package groovyx.net.http;
023    
024    import groovy.lang.Closure;
025    import groovy.util.XmlSlurper;
026    import groovy.util.slurpersupport.GPathResult;
027    import groovyx.net.http.HTTPBuilder.SendDelegate;
028    
029    import java.io.IOException;
030    import java.io.InputStream;
031    import java.io.InputStreamReader;
032    import java.io.Reader;
033    import java.io.UnsupportedEncodingException;
034    import java.nio.charset.Charset;
035    import java.util.HashMap;
036    import java.util.List;
037    import java.util.Map;
038    
039    import javax.xml.parsers.ParserConfigurationException;
040    
041    import net.sf.json.JSON;
042    import net.sf.json.groovy.JsonSlurper;
043    
044    import org.apache.commons.logging.Log;
045    import org.apache.commons.logging.LogFactory;
046    import org.apache.http.HttpResponse;
047    import org.apache.http.NameValuePair;
048    import org.apache.http.client.utils.URLEncodedUtils;
049    import org.codehaus.groovy.runtime.DefaultGroovyMethods;
050    import org.codehaus.groovy.runtime.MethodClosure;
051    import org.cyberneko.html.parsers.SAXParser;
052    import org.xml.sax.SAXException;
053    
054    
055    /**
056     * <p>Keeps track of response parsers for each content type.  Each parser 
057     * should should be a closure that accepts an {@link HttpResponse} instance,
058     * and returns whatever handler is appropriate for reading the response 
059     * data for that content-type.  For example, a plain-text response should 
060     * probably be parsed with a <code>Reader</code>, while an XML response 
061     * might be parsed by an XmlSlurper, which would then be passed to the 
062     * response closure. </p>
063     * 
064     * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
065     * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
066     * a NullPointerException is not thrown by passing a response that contains no
067     * entity.</p>
068     * 
069     * @see ContentType
070     */
071    public class ParserRegistry {
072            
073            protected Closure defaultParser = new MethodClosure( this, "parseStream" );
074            protected final Log log = LogFactory.getLog( getClass() );
075            
076            /**
077             * Helper method to get the charset from the response.  This should be done 
078             * when manually parsing any text response to ensure it is decoded using the
079             * correct charset. For instance:<pre>
080             * Reader reader = new InputStreamReader( resp.getEntity().getContent(), 
081             *   ParserRegistry.getCharset( resp ) );</pre>
082             * @param resp
083             */
084            public static String getCharset( HttpResponse resp ) {
085                    NameValuePair charset = resp.getEntity().getContentType()
086                                    .getElements()[0].getParameterByName("charset"); 
087                    return ( charset == null || charset.getValue().trim().equals("") ) ?
088                            Charset.defaultCharset().name() : charset.getValue();
089            }
090            
091            /**
092             * Helper method to get the content-type string from the response 
093             * (no charset).
094             * @param resp
095             */
096            public static String getContentType( HttpResponse resp ) {
097                    /* TODO how do we handle a very rude server who does not return a 
098                       content-type header?  It could cause an NPE here. and in getCharset */
099                    return resp.getEntity().getContentType()
100                            .getElements()[0].getName();
101            }
102            
103            /**
104             * Default parser used for binary data.
105             * @see ContentType#BINARY
106             * @param resp
107             * @return an InputStream 
108             * @throws IllegalStateException
109             * @throws IOException
110             */
111            public InputStream parseStream( HttpResponse resp ) throws IOException {
112                    return resp.getEntity().getContent();
113            }
114            
115            /**
116             * Default parser used to handle plain text data.  The response text 
117             * is decoded using the charset passed in the response content-type 
118             * header. 
119             * @see ContentType#TEXT
120             * @param resp
121             * @return
122             * @throws UnsupportedEncodingException
123             * @throws IllegalStateException
124             * @throws IOException
125             */
126            public Reader parseText( HttpResponse resp ) throws IOException {
127                    return new InputStreamReader( resp.getEntity().getContent(), 
128                                    ParserRegistry.getCharset( resp ) );
129            }
130            
131            /**
132             * Default parser used to decode a URL-encoded response.
133             * @see ContentType#URLENC
134             * @param resp
135             * @return
136             * @throws IOException
137             */
138            public Map<String,String> parseForm( HttpResponse resp ) throws IOException {
139                    List<NameValuePair> params = URLEncodedUtils.parse( resp.getEntity() );
140                    Map<String,String> paramMap = new HashMap<String,String>(params.size());
141                    for ( NameValuePair param : params ) 
142                            paramMap.put( param.getName(), param.getValue() );
143                    return paramMap;
144            }
145            
146            /**
147             * Parse an HTML document by passing it through the NekoHTML parser.
148             * @see ContentType#HTML
149             * @see SAXParser
150             * @see XmlSlurper#parse(Reader)
151             * @param resp HTTP response from which to parse content
152             * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
153             * @throws IOException
154             * @throws SAXException
155             */
156            public GPathResult parseHTML( HttpResponse resp ) throws IOException, SAXException {
157                    return new XmlSlurper( new org.cyberneko.html.parsers.SAXParser() )
158                            .parse( parseText( resp ) );
159            }
160            
161            /**
162             * Default parser used to decode an XML response.  
163             * @see ContentType#XML
164             * @see XmlSlurper#parse(Reader)
165             * @param resp HTTP response from which to parse content
166             * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
167             * @throws IOException
168             * @throws SAXException
169             * @throws ParserConfigurationException
170             */
171            public GPathResult parseXML( HttpResponse resp ) throws IOException, SAXException, ParserConfigurationException {
172                    return new XmlSlurper().parse( parseText( resp ) );
173            }
174            
175            /**
176             * Default parser used to decode a JSON response.
177             * @see ContentType#JSON
178             * @param resp
179             * @return
180             * @throws IOException
181             */
182            public JSON parseJSON( HttpResponse resp ) throws IOException {
183                    // there is a bug in the JsonSlurper.parse method...
184                    String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );                     
185                    return new JsonSlurper().parseText( jsonTxt );
186            }
187            
188            protected Map<String,Closure> registeredParsers = buildDefaultParserMap();
189            
190            /**
191             * Register a new parser for the given content-type.  The parser closure
192             * should accept an {@link HttpResponse} argument and return a type suitable
193             * to be passed to a {@link SendDelegate#getResponse() response handler}.
194             * The value returned from the parser closure is always the second parameter 
195             * of the response handler closure.  
196             * @param contentType  <code>content-type</code> string
197             * @param closure code that will parse the HttpResponse and return parsed 
198             *   data to the response handler. 
199             */
200            public void register( String contentType, Closure closure ) {
201                    registeredParsers.put( contentType, closure );
202            }
203            
204            /* Retrieve a parser for the given response content-type string.  This
205             * should usually not be called by a user.  The appropriate parser will
206             * be resolved prior to executing the response handler. 
207             * @param contentType
208             * @return parser that can interpret the given response content type,
209             *   or the default parser if no parser is registered for the given 
210             *   content-type.  It should NOT return a null value.
211             */
212            Closure get( String contentType ) { 
213                    Closure parser = registeredParsers.get(contentType);
214                    if ( parser == null ) {
215                            log.warn( "Cannot find parser for content-type: " + contentType 
216                                            + " -- using default parser.");
217                            parser = defaultParser;
218                    }
219                    return parser;
220            }
221            
222            /**
223             * Returns a map of default parsers.  Override this method to change 
224             * what parsers are registered by default.  You can of course call
225             * <code>super.buildDefaultParserMap()</code> and then add or remove 
226             * from that result as well.
227             */
228            protected Map<String,Closure> buildDefaultParserMap() {
229                    Map<String,Closure> parsers = new HashMap<String,Closure>();
230                    
231                    parsers.put( ContentType.BINARY.toString(), new MethodClosure( this, "parseStream" ) );
232                    parsers.put( ContentType.TEXT.toString(), new MethodClosure(this,"parseText") );
233                    parsers.put( ContentType.URLENC.toString(), new MethodClosure(this,"parseForm") );
234                    parsers.put( ContentType.HTML.toString(), new MethodClosure(this,"parseHTML") );
235                    
236                    Closure pClosure = new MethodClosure(this,"parseXML");
237                    for ( String ct : ContentType.XML.getContentTypeStrings() )
238                            parsers.put( ct, pClosure );
239                    
240                    pClosure = new MethodClosure(this,"parseJSON");
241                    for ( String ct : ContentType.JSON.getContentTypeStrings() )
242                            parsers.put( ct, pClosure );
243                    
244                    return parsers;
245            }
246    }