001package com.hfg.util.io; 002 003import java.net.HttpURLConnection; 004import java.net.URL; 005import java.io.BufferedReader; 006import java.io.IOException; 007import java.io.InputStreamReader; 008import java.util.ArrayList; 009import java.util.Base64; 010import java.util.List; 011import java.util.regex.Matcher; 012import java.util.regex.Pattern; 013 014import com.hfg.util.collection.CollectionUtil; 015import com.hfg.util.StringBuilderPlus; 016import com.hfg.util.StringUtil; 017 018//------------------------------------------------------------------------------ 019/** 020 * Delineates HTTP files matching the specified path. 021 * 022 * @author J. Alex Taylor, hairyfatguy.com 023 */ 024//------------------------------------------------------------------------------ 025// com.hfg XML/HTML Coding Library 026// 027// This library is free software; you can redistribute it and/or 028// modify it under the terms of the GNU Lesser General Public 029// License as published by the Free Software Foundation; either 030// version 2.1 of the License, or (at your option) any later version. 031// 032// This library is distributed in the hope that it will be useful, 033// but WITHOUT ANY WARRANTY; without even the implied warranty of 034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 035// Lesser General Public License for more details. 036// 037// You should have received a copy of the GNU Lesser General Public 038// License along with this library; if not, write to the Free Software 039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 040// 041// J. 
Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 042// jataylor@hairyfatguy.com 043//------------------------------------------------------------------------------ 044 045public class HTTPRemoteFileLister extends AbstractRemoteFileLister<HTTPRemoteFile> 046{ 047 048 //*************************************************************************** 049 // PRIVATE FIELDS 050 //*************************************************************************** 051 052 private List<HTTPRemoteFile> mRemoteFiles; 053 054 private String mUserAgentString = sDefaultUserAgentString; 055 056 private static String sDefaultUserAgentString; 057 private static final String DEFAULT_PROTOCOL = "http"; 058 059 //*************************************************************************** 060 // CONSTRUCTORS 061 //*************************************************************************** 062 063 //--------------------------------------------------------------------------- 064 public HTTPRemoteFileLister() 065 { 066 super(); 067 } 068 069 //--------------------------------------------------------------------------- 070 public HTTPRemoteFileLister(URL inFilePath) 071 { 072 super(inFilePath.toString()); 073 } 074 075 //--------------------------------------------------------------------------- 076 public HTTPRemoteFileLister(String inFilePath) 077 { 078 super(inFilePath); 079 } 080 081 //--------------------------------------------------------------------------- 082 public HTTPRemoteFileLister(String inFilePath, List<RemoteFileFilter> inFilterList) 083 { 084 super(inFilePath, inFilterList); 085 } 086 087 088 //*************************************************************************** 089 // PUBLIC METHODS 090 //*************************************************************************** 091 092 //--------------------------------------------------------------------------- 093 @Override 094 public String getProtocol() 095 { 096 return (getFilePath() != null ? 
getFilePath().substring(0, getFilePath().indexOf(":")) : DEFAULT_PROTOCOL); 097 } 098 099 //--------------------------------------------------------------------------- 100 public void clearRemoteFileList() 101 { 102 mRemoteFiles = null; 103 } 104 105 //--------------------------------------------------------------------------- 106 /** 107 Sets the user agent string to use during communication with the server. 108 Some sites don't want to talk to Java so some spoofing may be in order. 109 110 @param inValue The user agent string to use. 111 */ 112 public void setUserAgentString(String inValue) 113 { 114 mUserAgentString = inValue; 115 } 116 117 //--------------------------------------------------------------------------- 118 /** 119 Sets the default user agent string to use during communication with the server. 120 Some sites don't want to talk to Java so some spoofing may be in order. 121 122 @param inValue The default user agent string to use. 123 */ 124 public static void setDefaultUserAgentString(String inValue) 125 { 126 sDefaultUserAgentString = inValue; 127 } 128 129 //*************************************************************************** 130 // PROTECTED METHODS 131 //*************************************************************************** 132 133 //--------------------------------------------------------------------------- 134 protected List<HTTPRemoteFile> getUnfilteredRemoteFileListImpl() 135 { 136 if (null == mRemoteFiles) 137 { 138 mRemoteFiles = new ArrayList<>(); 139 140 List<String> urls = expandURL(getFilePath()); 141 for (String url : urls) 142 { 143 mRemoteFiles.add(new HTTPRemoteFile(url).setUserAgentString(mUserAgentString)); 144 } 145 } 146 147 return mRemoteFiles; 148 } 149 150 //*************************************************************************** 151 // PRIVATE METHODS 152 //*************************************************************************** 153 154 
//--------------------------------------------------------------------------- 155 private List<String> expandURL(String inURL) 156 { 157 List<String> urls = new ArrayList<>(); 158 159 int index = inURL.indexOf("*"); 160 if (index > 0) 161 { 162 String dirURL = getDirURL(inURL, index); 163 List<String> links = getLinksInDir(dirURL); 164 165 String regexp = getDirRegExp(inURL, index); 166 String urlEnd = getURLEnd(inURL, index); 167 168 // We don't want to get caught by things like http to https redirection 169 Pattern dirPattern = Pattern.compile("(https?://" + dirURL.substring(dirURL.indexOf("://") + 3)+ ")"); 170 171 if (CollectionUtil.hasValues(links)) 172 { 173 for (String link : links) 174 { 175 Matcher m = dirPattern.matcher(link); 176 if (m.find()) 177 { 178 link = link.substring(m.group(1).length()); 179 } 180 181 if (link.matches(regexp)) 182 { 183 if (urlEnd.length() > 0) 184 { 185 urls.addAll(expandURL(dirURL + link.substring(0, link.indexOf("/")) + (urlEnd.startsWith("/") ? "" : "/") + urlEnd)); 186 } 187 else 188 { 189 urls.add(dirURL + link); 190 } 191 } 192 } 193 } 194 } 195 else 196 { 197 urls.add(inURL); 198 } 199 200 return urls; 201 } 202 203 //--------------------------------------------------------------------------- 204 private String getDirURL(String inURL, int inIndex) 205 { 206 String dir = inURL.substring(0, inIndex); 207 int slashIndex = dir.lastIndexOf('/'); 208 if (slashIndex > 0) 209 { 210 dir = dir.substring(0, slashIndex + 1); 211 } 212 213 return dir; 214 } 215 216 //--------------------------------------------------------------------------- 217 private String getDirRegExp(String inURL, int inIndex) 218 { 219 String dirNameLeftOfWildcard = inURL.substring(0, inIndex); 220 int slashIndex = dirNameLeftOfWildcard.lastIndexOf('/'); 221 if (slashIndex >= 0) 222 { 223 dirNameLeftOfWildcard = dirNameLeftOfWildcard.substring(slashIndex + 1); 224 } 225 226 String dirNameRightOfWildcard = inURL.substring(inIndex); 227 slashIndex = 
dirNameRightOfWildcard.indexOf('/'); 228 if (slashIndex >= 0) 229 { 230 dirNameRightOfWildcard = dirNameRightOfWildcard.substring(0, slashIndex); 231 } 232 233 String regexp = dirNameLeftOfWildcard + dirNameRightOfWildcard; 234 235 if (slashIndex > 0) 236 { 237 regexp += "(?:/|" + inURL.substring(inIndex).substring(slashIndex) + "|$)"; 238 } 239 240 regexp = StringUtil.replaceAll(regexp, ".", "\\."); 241 regexp = StringUtil.replaceAll(regexp, "*", ".*"); 242 243 return regexp; 244 } 245 246 //--------------------------------------------------------------------------- 247 private String getURLEnd(String inURL, int inIndex) 248 { 249 String snippet = inURL.substring(inIndex); 250 int slashIndex = snippet.indexOf('/'); 251 if (slashIndex >= 0) 252 { 253 snippet = snippet.substring(slashIndex); 254 } 255 else 256 { 257 snippet = ""; 258 } 259 260 return snippet; 261 } 262 263 //--------------------------------------------------------------------------- 264 private HttpURLConnection openConnection(String inURL) 265 throws IOException 266 { 267 HttpURLConnection conn = (HttpURLConnection) new URL(inURL).openConnection(); 268 269 // Pretend to be a browser. 
Some sites don't want to talk to Java 270 if (StringUtil.isSet(mUserAgentString)) 271 { 272 conn.setRequestProperty("User-Agent", mUserAgentString); 273 } 274 275 if (getCredentials() != null) 276 { 277 String base64Auth = Base64.getEncoder().encodeToString((getCredentials().getUser() + ":" + new String(getCredentials().getPassword())).getBytes()); 278 conn.setRequestProperty("Authorization", "Basic "+ base64Auth); 279 } 280 281 conn.connect(); 282 283 return conn; 284 } 285 286 //--------------------------------------------------------------------------- 287 private List<String> getLinksInDir(String inDirURL) 288 { 289 List<String> links = new ArrayList<>(); 290 291 try 292 { 293 String url = inDirURL; 294 295 HttpURLConnection conn = null; 296 int numAttempts = 0; 297 while (numAttempts < 3) 298 { 299 conn = openConnection(url); 300 if (conn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM 301 || conn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) 302 { 303 url = conn.getHeaderField("Location"); 304 numAttempts++; 305 } 306 else 307 { 308 break; 309 } 310 } 311 312 313// System.out.println("RESPONSE CODE: " + conn.getResponseCode()); 314// System.out.println("RESPONSE MSG: " + conn.getResponseMessage()); 315// System.out.println("CONTENT LENGTH: " + conn.getContentLength()); 316// System.out.println("CONTENT TYPE: " + conn.getContentType()); 317 318 if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) 319 { 320 BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream())); 321 String line; 322 StringBuilderPlus dirHTML = new StringBuilderPlus(); 323 while ((line = reader.readLine()) != null) 324 { 325 dirHTML.appendln(line); 326 } 327 reader.close(); 328 329 // Painful but we need to scrape the page looking for the directory's contents. 330 // (Assuming it IS a directory.) 
331 Pattern p = Pattern.compile("<a.+?href=[\\\"\\\'](.+?)[\\\"\\\']", 332 Pattern.CASE_INSENSITIVE); 333 334 Matcher m = p.matcher(dirHTML); 335 while (m.find()) 336 { 337 links.add(m.group(1)); 338 } 339 } 340 } 341 catch (IOException e) 342 { 343 throw new RuntimeException(e.toString()); 344 } 345 346 return links; 347 } 348}