# telemetry/page/page_set_archive_info.py

# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import json
import logging
import os
import re
import shutil

from telemetry.page import cloud_storage


def _UpdateHashFile(file_path):
  """Stores the hash of |file_path| in a sibling '<file_path>.sha1' file.

  The hash value is whatever cloud_storage.GetHash returns for |file_path|.

  Args:
    file_path: Path of the file whose hash should be recorded.
  """
  hash_file_path = file_path + '.sha1'
  with open(hash_file_path, 'wb') as hash_file:
    digest = cloud_storage.GetHash(file_path)
    hash_file.write(digest)
    hash_file.flush()


class PageSetArchiveInfo(object):
  """Tracks which Web Page Replay (.wpr) archive holds each page of a page set.

  The mapping is backed by a JSON metadata file. Its 'archives' entry maps a
  .wpr file name (relative to the directory of the metadata file) to the list
  of page urls that archive supports.
  """

  def __init__(self, archive_data_file_path, page_set_file_path, data):
    """Builds the url <-> archive mappings from already parsed metadata.

    Args:
      archive_data_file_path: Path of the JSON metadata file.
      page_set_file_path: Path of the page set file the archives belong to.
      data: Parsed metadata. data['archives'] maps a .wpr file name to the
          list of urls it supports. The dict is kept (not copied) and is
          modified when recordings are added.
    """
    self._archive_data_file_path = archive_data_file_path
    self._archive_data_file_dir = os.path.dirname(archive_data_file_path)
    # Back pointer to the page set file.
    self._page_set_file_path = page_set_file_path

    # Map from the relative path (as it appears in the metadata file) of the
    # .wpr file to a list of urls it supports.
    self._wpr_file_to_urls = data['archives']

    # Archive names in the metadata are relative to the metadata file, and
    # everything else in this class resolves them with _WprFileNameToPath.
    # Hand cloud storage the resolved path as well, not the bare name.
    for wpr_file in self._wpr_file_to_urls:
      cloud_storage.GetIfChanged(cloud_storage.DEFAULT_BUCKET,
                                 self._WprFileNameToPath(wpr_file))

    # Map from the page url to a relative path (as it appears in the metadata
    # file) of the .wpr file.
    self._url_to_wpr_file = dict()
    for wpr_file, page_urls in self._wpr_file_to_urls.items():
      for url in page_urls:
        self._url_to_wpr_file[url] = wpr_file

    # Set by AddNewTemporaryRecording; while set it overrides every lookup in
    # WprFilePathForPage.
    self.temp_target_wpr_file_path = None

  @classmethod
  def FromFile(cls, file_path, page_set_file_path):
    """Creates an instance from the metadata file at |file_path|.

    Args:
      file_path: Path of the JSON metadata file.
      page_set_file_path: Path of the page set file.

    Returns:
      A PageSetArchiveInfo. If |file_path| does not exist after the cloud
      storage check, the instance starts with no archives.
    """
    cloud_storage.GetIfChanged(cloud_storage.DEFAULT_BUCKET, file_path)

    if not os.path.exists(file_path):
      return cls(file_path, page_set_file_path, {'archives': {}})
    with open(file_path, 'r') as f:
      data = json.load(f)
    return cls(file_path, page_set_file_path, data)

  def WprFilePathForPage(self, page):
    """Returns the path of the archive to use for |page|.

    Args:
      page: Object with a |url| attribute.

    Returns:
      The temporary recording path if one has been set, otherwise the absolute
      path of the archive mapped to page.url, or None if there is none.
    """
    if self.temp_target_wpr_file_path:
      return self.temp_target_wpr_file_path
    wpr_file = self._url_to_wpr_file.get(page.url, None)
    if wpr_file:
      return self._WprFileNameToPath(wpr_file)
    return None

  def AddNewTemporaryRecording(self, temp_target_wpr_file_path):
    """Remembers the path of the temporary archive being recorded."""
    self.temp_target_wpr_file_path = temp_target_wpr_file_path

  def AddRecordedPages(self, urls):
    """Turns the temporary recording into the archive for |urls|.

    Picks the next free archive name, maps every url to it, moves the
    temporary recording (see AddNewTemporaryRecording) to that name, updates
    the hash file and the metadata file, and finally deletes archives that no
    longer support any url.

    Args:
      urls: Urls of the pages contained in the temporary recording.
    """
    (target_wpr_file, target_wpr_file_path) = self._NextWprFileName()
    for url in urls:
      self._SetWprFileForPage(url, target_wpr_file)
    shutil.move(self.temp_target_wpr_file_path, target_wpr_file_path)
    _UpdateHashFile(target_wpr_file_path)
    self._WriteToFile()
    self._DeleteAbandonedWprFiles()

  def _DeleteAbandonedWprFiles(self):
    """Drops archives without urls from the mapping and deletes their files."""
    # Update the metadata so that the abandoned wpr files don't have empty url
    # arrays.
    for wpr_file in self._AbandonedWprFiles():
      del self._wpr_file_to_urls[wpr_file]
      wpr_file_path = self._WprFileNameToPath(wpr_file)
      # Deliberately best effort: don't fail if we're unable to delete some of
      # the files.
      try:
        os.remove(wpr_file_path)
      except Exception:
        logging.warning('Failed to delete file: %s', wpr_file_path)

  def _AbandonedWprFiles(self):
    """Returns the names of the archives whose url list is empty."""
    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    return [wpr_file for wpr_file, urls in self._wpr_file_to_urls.items()
            if not urls]

  def _WriteToFile(self):
    """Writes the metadata into the file passed as constructor parameter."""
    metadata = dict()
    metadata['description'] = (
        'Describes the Web Page Replay archives for a page set. Don\'t edit by '
        'hand! Use record_wpr for updating.')
    # Pointer from the metadata to the page set .json file.
    metadata['page_set'] = os.path.relpath(self._page_set_file_path,
                                           self._archive_data_file_dir)
    # Don't write data for abandoned archives (the ones without urls).
    metadata['archives'] = dict(
        (wpr_file, urls)
        for wpr_file, urls in self._wpr_file_to_urls.items() if urls)

    # Leaving the with block closes (and thereby flushes) the file before it
    # is hashed.
    with open(self._archive_data_file_path, 'w') as f:
      json.dump(metadata, f, indent=4)
    _UpdateHashFile(self._archive_data_file_path)

  def _WprFileNameToPath(self, wpr_file):
    """Returns the absolute path of |wpr_file| in the metadata directory."""
    return os.path.abspath(os.path.join(self._archive_data_file_dir, wpr_file))

  def _NextWprFileName(self):
    """Creates a new file name for a wpr archive file.

    Returns:
      A (file name, absolute file path) tuple for the new archive.

    Raises:
      Exception: A known archive name is not of the form
          "some_thing_number.wpr", or the known names don't share one base.
    """
    # The names are of the format "some_thing_number.wpr". Read the numbers.
    highest_number = -1
    base = None
    for wpr_file in self._wpr_file_to_urls:
      match = re.match(r'(?P<BASE>.*)_(?P<NUMBER>[0-9]+)\.wpr', wpr_file)
      if not match:
        raise Exception('Illegal wpr file name ' + wpr_file)
      highest_number = max(int(match.group('NUMBER')), highest_number)
      if base and match.group('BASE') != base:
        raise Exception('Illegal wpr file name ' + wpr_file +
                        ', doesn\'t begin with ' + base)
      base = match.group('BASE')
    if not base:
      # If we're creating a completely new info file, use the base name of the
      # page set file.
      base = os.path.splitext(os.path.basename(self._page_set_file_path))[0]
    new_filename = '%s_%03d.wpr' % (base, highest_number + 1)
    return new_filename, self._WprFileNameToPath(new_filename)

  def _SetWprFileForPage(self, url, wpr_file):
    """For modifying the metadata when we're going to record a new archive.

    Moves |url| from the archive it was mapped to (if any) to |wpr_file|.
    """
    old_wpr_file = self._url_to_wpr_file.get(url, None)
    if old_wpr_file:
      self._wpr_file_to_urls[old_wpr_file].remove(url)
    self._url_to_wpr_file[url] = wpr_file
    if wpr_file not in self._wpr_file_to_urls:
      self._wpr_file_to_urls[wpr_file] = []
    self._wpr_file_to_urls[wpr_file].append(url)