Home | History | Annotate | Download | only in firefox
      1 #!/usr/bin/env python
      2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Does scraping for Firefox 2.0."""
      7 
      8 import pywintypes
      9 import time
     10 import types
     11 
     12 from drivers import keyboard
     13 from drivers import mouse
     14 from drivers import windowing
     15 
     16 # Default version
     17 version = "2.0.0.6"
     18 
     19 DEFAULT_PATH = r"c:\program files\mozilla firefox\firefox.exe"
     20 
     21 # TODO(jhaas): the Firefox scraper is a bit rickety at the moment. Known
     22 # issues: 1) won't work if the default profile puts toolbars in different
     23 # locations, 2) uses sleep() statements rather than more robust checks,
     24 # 3) fails badly if an existing Firefox window is open when the scrape
     25 # is invoked. This needs to be fortified at some point.
     26 
     27 def GetBrowser(path):
     28   """Invoke the Firefox browser and return the process and window.
     29 
     30   Args:
     31     path: full path to browser
     32 
     33   Returns:
     34     A tuple of (process handle, render pane)
     35   """
     36   if not path: path = DEFAULT_PATH
     37 
     38   # Invoke Firefox
     39   (proc, wnd) = windowing.InvokeAndWait(path)
     40 
     41   # Get the content pane
     42   render_pane = windowing.FindChildWindow(
     43     wnd,
     44     "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass")
     45 
     46   return (proc, wnd, render_pane)
     47 
     48 
     49 def InvokeBrowser(path):
     50   """Invoke the Firefox browser.
     51 
     52   Args:
     53     path: full path to browser
     54 
     55   Returns:
     56     A tuple of (main window, process handle, render pane)
     57   """
     58   # Reuse an existing instance of the browser if we can find one. This
     59   # may not work correctly, especially if the window is behind other windows.
     60   wnds = windowing.FindChildWindows(0, "MozillaUIWindowClass")
     61   if len(wnds):
     62     wnd = wnds[0]
     63     proc = None
     64   else:
     65     # Invoke Firefox
     66     (proc, wnd) = windowing.InvokeAndWait(path)
     67 
     68   # Get the content pane
     69   render_pane = windowing.FindChildWindow(
     70     wnd,
     71     "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass")
     72 
     73   return (wnd, proc, render_pane)
     74 
     75 
     76 def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
     77   """Invoke a browser, send it to a series of URLs, and save its output.
     78 
     79   Args:
     80     urls: list of URLs to scrape
     81     outdir: directory to place output
     82     size: size of browser window to use
     83     pos: position of browser window
     84     timeout: amount of time to wait for page to load
     85     kwargs: miscellaneous keyword args
     86 
     87   Returns:
     88     None if success, else an error string
     89   """
     90   if "path" in kwargs and kwargs["path"]: path = kwargs["path"]
     91   else: path = DEFAULT_PATH
     92 
     93   (wnd, proc, render_pane) = InvokeBrowser(path)
     94 
     95   # Resize and reposition the frame
     96   windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)
     97 
     98   time.sleep(3)
     99 
    100   # Firefox is a bit of a pain: it doesn't use standard edit controls,
    101   # and it doesn't display a throbber when there's no tab. Let's make
    102   # sure there's at least one tab, then select the first one
    103 
    104   mouse.ClickInWindow(wnd)
    105   keyboard.TypeString("[t]", True)
    106   mouse.ClickInWindow(wnd, (30, 115))
    107   time.sleep(2)
    108 
    109   timedout = False
    110 
    111   # Visit each URL we're given
    112   if type(urls) in types.StringTypes: urls = [urls]
    113 
    114   for url in urls:
    115 
    116     # Use keyboard shortcuts
    117     keyboard.TypeString("{d}", True)
    118     keyboard.TypeString(url)
    119     keyboard.TypeString("\n")
    120 
    121     # Wait for the page to finish loading
    122     load_time = windowing.WaitForThrobber(wnd, (10, 96, 26, 112), timeout)
    123     timedout = load_time < 0
    124 
    125     if timedout:
    126       break
    127 
    128     # Scrape the page
    129     image = windowing.ScrapeWindow(render_pane)
    130 
    131     # Save to disk
    132     if "filename" in kwargs:
    133       if callable(kwargs["filename"]):
    134         filename = kwargs["filename"](url)
    135       else:
    136         filename = kwargs["filename"]
    137     else:
    138       filename = windowing.URLtoFilename(url, outdir, ".bmp")
    139     image.save(filename)
    140 
    141   # Close all the tabs, cheesily
    142   mouse.ClickInWindow(wnd)
    143 
    144   while len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
    145     keyboard.TypeString("[w]", True)
    146     time.sleep(1)
    147 
    148   if timedout:
    149     return "timeout"
    150 
    151 
    152 def Time(urls, size, timeout, **kwargs):
    153   """Measure how long it takes to load each of a series of URLs
    154 
    155   Args:
    156     urls: list of URLs to time
    157     size: size of browser window to use
    158     timeout: amount of time to wait for page to load
    159     kwargs: miscellaneous keyword args
    160 
    161   Returns:
    162     A list of tuples (url, time). "time" can be "crashed" or "timeout"
    163   """
    164   if "path" in kwargs and kwargs["path"]: path = kwargs["path"]
    165   else: path = DEFAULT_PATH
    166   proc = None
    167 
    168   # Visit each URL we're given
    169   if type(urls) in types.StringTypes: urls = [urls]
    170 
    171   ret = []
    172   for url in urls:
    173     try:
    174       # Invoke the browser if necessary
    175       if not proc:
    176         (wnd, proc, render_pane) = InvokeBrowser(path)
    177 
    178         # Resize and reposition the frame
    179         windowing.MoveAndSizeWindow(wnd, (0,0), size, render_pane)
    180 
    181         time.sleep(3)
    182 
    183         # Firefox is a bit of a pain: it doesn't use standard edit controls,
    184         # and it doesn't display a throbber when there's no tab. Let's make
    185         # sure there's at least one tab, then select the first one
    186 
    187         mouse.ClickInWindow(wnd)
    188         keyboard.TypeString("[t]", True)
    189         mouse.ClickInWindow(wnd, (30, 115))
    190         time.sleep(2)
    191 
    192       # Use keyboard shortcuts
    193       keyboard.TypeString("{d}", True)
    194       keyboard.TypeString(url)
    195       keyboard.TypeString("\n")
    196 
    197       # Wait for the page to finish loading
    198       load_time = windowing.WaitForThrobber(wnd, (10, 96, 26, 112), timeout)
    199       timedout = load_time < 0
    200 
    201       if timedout:
    202         load_time = "timeout"
    203 
    204         # Try to close the browser; if this fails it's probably a crash
    205         mouse.ClickInWindow(wnd)
    206 
    207         count = 0
    208         while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
    209           and count < 5):
    210           keyboard.TypeString("[w]", True)
    211           time.sleep(1)
    212           count = count + 1
    213 
    214         if len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
    215           windowing.EndProcess(proc)
    216           load_time = "crashed"
    217 
    218         proc = None
    219     except pywintypes.error:
    220       proc = None
    221       load_time = "crashed"
    222 
    223     ret.append( (url, load_time) )
    224 
    225   if proc:
    226     count = 0
    227     while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
    228       and count < 5):
    229       keyboard.TypeString("[w]", True)
    230       time.sleep(1)
    231       count = count + 1
    232   return ret
    233 
    234 
    235 def main():
    236   # We're being invoked rather than imported, so run some tests
    237   path = r"c:\sitecompare\scrapes\Firefox\2.0.0.6"
    238   windowing.PreparePath(path)
    239 
    240   # Scrape three sites and save the results
    241   Scrape(
    242     ["http://www.microsoft.com", "http://www.google.com",
    243      "http://www.sun.com"],
    244     path, (1024, 768), (0, 0))
    245   return 0
    246 
    247 
    248 if __name__ == "__main__":
    249   sys.exit(main())
    250