1   /*
2    * FetchAsGooglebot - Fetch as Googlebot
3    * Copyright (C) 2009 Christian Schenk
4    *
5    * This program is free software; you can redistribute it and/or
6    * modify it under the terms of the GNU General Public License
7    * as published by the Free Software Foundation; either version 2
8    * of the License, or (at your option) any later version.
9    * 
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   * 
15   * You should have received a copy of the GNU General Public License
16   * along with this program; if not, write to the Free Software
17   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
18   */
19  package org.christianschenk.fetchasgooglebot;
20  
21  import org.apache.http.Header;
22  import org.apache.http.HttpEntity;
23  import org.apache.http.HttpResponse;
24  import org.apache.http.client.HttpClient;
25  import org.apache.http.client.methods.HttpGet;
26  import org.apache.http.impl.client.DefaultHttpClient;
27  import org.apache.http.params.CoreProtocolPNames;
28  import org.apache.http.util.EntityUtils;
29  
30  /**
31   * Tries to reimplement the "Fetch as Googlebot" feature, a part of Google's "Webmaster Tools".
32   * 
33   * @author Christian Schenk
34   */
35  public class MyFetchAsGooglebot {
36  
37  	private final HttpClient httpclient;
38  
39  	public MyFetchAsGooglebot() {
40  		this.httpclient = new DefaultHttpClient();
41  	}
42  
43  	/**
44  	 * Fetches the given URL and prints the HTTP header fields and the content.
45  	 */
46  	public void fetchUrl(final String url) {
47  		try {
48  			final HttpGet httpget = new HttpGet(url);
49  			httpget.getParams().setParameter(CoreProtocolPNames.USER_AGENT, "Googlebot");
50  
51  			final HttpResponse response = this.httpclient.execute(httpget);
52  			final HttpEntity entity = response.getEntity();
53  
54  			if (entity == null) throw new RuntimeException("Entity is null");
55  
56  			/*
57  			 * print the HTTP header fields
58  			 */
59  			System.out.println(response.getStatusLine().toString());
60  			for (final Header header : response.getAllHeaders()) {
61  				System.out.println(header.toString());
62  			}
63  
64  			System.out.println("");
65  
66  			/*
67  			 * print the content
68  			 */
69  			System.out.println(EntityUtils.toString(entity));
70  		} catch (final Exception ex) {
71  			throw new RuntimeException(ex);
72  		}
73  	}
74  
75  	public static void main(final String[] args) {
76  		final MyFetchAsGooglebot myFaG = new MyFetchAsGooglebot();
77  
78  		final long start = System.currentTimeMillis();
79  		myFaG.fetchUrl("http://www.google.com");
80  		final long end = System.currentTimeMillis();
81  
82  		// Time it took to download the content
83  		System.out.println("\nThis took " + (end - start) + " ms");
84  
85  	}
86  }