001package com.hfg.util.io;
002
003import java.net.HttpURLConnection;
004import java.net.URL;
005import java.io.BufferedReader;
006import java.io.IOException;
007import java.io.InputStreamReader;
008import java.util.ArrayList;
009import java.util.Base64;
010import java.util.List;
011import java.util.regex.Matcher;
012import java.util.regex.Pattern;
013
014import com.hfg.util.collection.CollectionUtil;
015import com.hfg.util.StringBuilderPlus;
016import com.hfg.util.StringUtil;
017
018//------------------------------------------------------------------------------
019/**
 * Delineates HTTP files matching the specified path.
021 *
022 * @author J. Alex Taylor, hairyfatguy.com
023 */
024//------------------------------------------------------------------------------
025// com.hfg XML/HTML Coding Library
026//
027// This library is free software; you can redistribute it and/or
028// modify it under the terms of the GNU Lesser General Public
029// License as published by the Free Software Foundation; either
030// version 2.1 of the License, or (at your option) any later version.
031//
032// This library is distributed in the hope that it will be useful,
033// but WITHOUT ANY WARRANTY; without even the implied warranty of
034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035// Lesser General Public License for more details.
036//
037// You should have received a copy of the GNU Lesser General Public
038// License along with this library; if not, write to the Free Software
039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
040//
041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
042// jataylor@hairyfatguy.com
043//------------------------------------------------------------------------------
044
045public class HTTPRemoteFileLister extends AbstractRemoteFileLister<HTTPRemoteFile>
046{
047
048   //***************************************************************************
049   // PRIVATE FIELDS
050   //***************************************************************************
051
052   private List<HTTPRemoteFile>      mRemoteFiles;
053
054   private String mUserAgentString = sDefaultUserAgentString;
055
056   private static String sDefaultUserAgentString;
057   private static final String DEFAULT_PROTOCOL = "http";
058
059   //***************************************************************************
060   // CONSTRUCTORS
061   //***************************************************************************
062
063   //---------------------------------------------------------------------------
064   public HTTPRemoteFileLister()
065   {
066      super();
067   }
068
069   //---------------------------------------------------------------------------
070   public HTTPRemoteFileLister(URL inFilePath)
071   {
072      super(inFilePath.toString());
073   }
074
075   //---------------------------------------------------------------------------
076   public HTTPRemoteFileLister(String inFilePath)
077   {
078      super(inFilePath);
079   }
080
081   //---------------------------------------------------------------------------
082   public HTTPRemoteFileLister(String inFilePath, List<RemoteFileFilter> inFilterList)
083   {
084      super(inFilePath, inFilterList);
085   }
086
087
088   //***************************************************************************
089   // PUBLIC METHODS
090   //***************************************************************************
091
092   //---------------------------------------------------------------------------
093   @Override
094   public String getProtocol()
095   {
096      return (getFilePath() != null ? getFilePath().substring(0, getFilePath().indexOf(":")) : DEFAULT_PROTOCOL);
097   }
098
099   //---------------------------------------------------------------------------
100   public void clearRemoteFileList()
101   {
102      mRemoteFiles = null;
103   }
104
105   //---------------------------------------------------------------------------
106   /**
107    Sets the user agent string to use during communication with the server.
108    Some sites don't want to talk to Java so some spoofing may be in order.
109
110    @param inValue The user agent string to use.
111    */
112   public void setUserAgentString(String inValue)
113   {
114      mUserAgentString = inValue;
115   }
116
117   //---------------------------------------------------------------------------
118   /**
119    Sets the default user agent string to use during communication with the server.
120    Some sites don't want to talk to Java so some spoofing may be in order.
121
122    @param inValue The default user agent string to use.
123    */
124   public static void setDefaultUserAgentString(String inValue)
125   {
126      sDefaultUserAgentString = inValue;
127   }
128
129   //***************************************************************************
130   // PROTECTED METHODS
131   //***************************************************************************
132
133   //---------------------------------------------------------------------------
134   protected List<HTTPRemoteFile> getUnfilteredRemoteFileListImpl()
135   {
136      if (null == mRemoteFiles)
137      {
138         mRemoteFiles = new ArrayList<>();
139
140         List<String> urls = expandURL(getFilePath());
141         for (String url : urls)
142         {
143            mRemoteFiles.add(new HTTPRemoteFile(url).setUserAgentString(mUserAgentString));
144         }
145      }
146
147      return mRemoteFiles;
148   }
149
150   //***************************************************************************
151   // PRIVATE METHODS
152   //***************************************************************************
153
154   //---------------------------------------------------------------------------
155   private List<String> expandURL(String inURL)
156   {
157      List<String> urls = new ArrayList<>();
158
159      int index = inURL.indexOf("*");
160      if (index > 0)
161      {
162         String dirURL = getDirURL(inURL, index);
163         List<String> links = getLinksInDir(dirURL);
164
165         String regexp = getDirRegExp(inURL, index);
166         String urlEnd = getURLEnd(inURL, index);
167
168         // We don't want to get caught by things like http to https redirection
169         Pattern dirPattern = Pattern.compile("(https?://" + dirURL.substring(dirURL.indexOf("://") + 3)+ ")");
170
171         if (CollectionUtil.hasValues(links))
172         {
173            for (String link : links)
174            {
175               Matcher m = dirPattern.matcher(link);
176               if (m.find())
177               {
178                  link = link.substring(m.group(1).length());
179               }
180
181               if (link.matches(regexp))
182               {
183                  if (urlEnd.length() > 0)
184                  {
185                     urls.addAll(expandURL(dirURL + link.substring(0, link.indexOf("/")) + (urlEnd.startsWith("/") ? "" : "/") + urlEnd));
186                  }
187                  else
188                  {
189                     urls.add(dirURL + link);
190                  }
191               }
192            }
193         }
194      }
195      else
196      {
197         urls.add(inURL);
198      }
199
200      return urls;
201   }
202
203   //---------------------------------------------------------------------------
204   private String getDirURL(String inURL, int inIndex)
205   {
206      String dir = inURL.substring(0, inIndex);
207      int slashIndex = dir.lastIndexOf('/');
208      if (slashIndex > 0)
209      {
210         dir = dir.substring(0, slashIndex + 1);
211      }
212
213      return dir;
214   }
215
216   //---------------------------------------------------------------------------
217   private String getDirRegExp(String inURL, int inIndex)
218   {
219      String dirNameLeftOfWildcard = inURL.substring(0, inIndex);
220      int slashIndex = dirNameLeftOfWildcard.lastIndexOf('/');
221      if (slashIndex >= 0)
222      {
223         dirNameLeftOfWildcard = dirNameLeftOfWildcard.substring(slashIndex + 1);
224      }
225
226      String dirNameRightOfWildcard = inURL.substring(inIndex);
227      slashIndex = dirNameRightOfWildcard.indexOf('/');
228      if (slashIndex >= 0)
229      {
230         dirNameRightOfWildcard = dirNameRightOfWildcard.substring(0, slashIndex);
231      }
232
233      String regexp = dirNameLeftOfWildcard + dirNameRightOfWildcard;
234
235      if (slashIndex > 0)
236      {
237         regexp += "(?:/|" + inURL.substring(inIndex).substring(slashIndex) + "|$)";
238      }
239
240      regexp = StringUtil.replaceAll(regexp, ".", "\\.");
241      regexp = StringUtil.replaceAll(regexp, "*", ".*");
242
243      return regexp;
244   }
245
246   //---------------------------------------------------------------------------
247   private String getURLEnd(String inURL, int inIndex)
248   {
249      String snippet = inURL.substring(inIndex);
250      int slashIndex = snippet.indexOf('/');
251      if (slashIndex >= 0)
252      {
253         snippet = snippet.substring(slashIndex);
254      }
255      else
256      {
257         snippet = "";
258      }
259
260      return snippet;
261   }
262
263   //---------------------------------------------------------------------------
264   private HttpURLConnection openConnection(String inURL)
265      throws IOException
266   {
267      HttpURLConnection conn = (HttpURLConnection) new URL(inURL).openConnection();
268
269      // Pretend to be a browser. Some sites don't want to talk to Java
270      if (StringUtil.isSet(mUserAgentString))
271      {
272         conn.setRequestProperty("User-Agent", mUserAgentString);
273      }
274
275      if (getCredentials() != null)
276      {
277         String base64Auth = Base64.getEncoder().encodeToString((getCredentials().getUser() + ":" + new String(getCredentials().getPassword())).getBytes());
278         conn.setRequestProperty("Authorization", "Basic "+ base64Auth);
279      }
280
281      conn.connect();
282
283      return conn;
284   }
285
286   //---------------------------------------------------------------------------
287   private List<String> getLinksInDir(String inDirURL)
288   {
289      List<String> links = new ArrayList<>();
290
291      try
292      {
293         String url = inDirURL;
294
295         HttpURLConnection conn = null;
296         int numAttempts = 0;
297         while (numAttempts < 3)
298         {
299            conn = openConnection(url);
300            if (conn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
301                || conn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
302            {
303               url = conn.getHeaderField("Location");
304               numAttempts++;
305            }
306            else
307            {
308               break;
309            }
310         }
311
312
313//         System.out.println("RESPONSE CODE: " + conn.getResponseCode());
314//         System.out.println("RESPONSE MSG: " + conn.getResponseMessage());
315//         System.out.println("CONTENT LENGTH: " + conn.getContentLength());
316//         System.out.println("CONTENT TYPE: " + conn.getContentType());
317
318         if (conn.getResponseCode() == HttpURLConnection.HTTP_OK)
319         {
320            BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
321            String line;
322            StringBuilderPlus dirHTML = new StringBuilderPlus();
323            while ((line = reader.readLine()) != null)
324            {
325               dirHTML.appendln(line);
326            }
327            reader.close();
328
329            // Painful but we need to scrape the page looking for the directory's contents.
330            // (Assuming it IS a directory.)
331            Pattern p = Pattern.compile("<a.+?href=[\\\"\\\'](.+?)[\\\"\\\']",
332                                        Pattern.CASE_INSENSITIVE);
333
334            Matcher m = p.matcher(dirHTML);
335            while (m.find())
336            {
337               links.add(m.group(1));
338            }
339         }
340      }
341      catch (IOException e)
342      {
343         throw new RuntimeException(e.toString());
344      }
345
346      return links;
347   }
348}