Home | History | Annotate | Download | only in media
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This file contains an implementation of VideoDecodeAccelerator
      6 // that utilizes hardware video decoders, which expose Video4Linux 2 API
      7 // (http://linuxtv.org/downloads/v4l-dvb-apis/).
      8 
      9 #ifndef CONTENT_COMMON_GPU_MEDIA_V4L2_VIDEO_DECODE_ACCELERATOR_H_
     10 #define CONTENT_COMMON_GPU_MEDIA_V4L2_VIDEO_DECODE_ACCELERATOR_H_
     11 
     12 #include <queue>
     13 #include <vector>
     14 
     15 #include "base/callback_forward.h"
     16 #include "base/memory/linked_ptr.h"
     17 #include "base/memory/scoped_ptr.h"
     18 #include "base/synchronization/waitable_event.h"
     19 #include "base/threading/thread.h"
     20 #include "content/common/content_export.h"
     21 #include "content/common/gpu/media/v4l2_video_device.h"
     22 #include "media/base/limits.h"
     23 #include "media/base/video_decoder_config.h"
     24 #include "media/video/picture.h"
     25 #include "media/video/video_decode_accelerator.h"
     26 #include "ui/gfx/size.h"
     27 #include "ui/gl/gl_bindings.h"
     28 
     29 namespace base {
     30 class MessageLoopProxy;
     31 }  // namespace base
     32 
     33 namespace media {
     34 class H264Parser;
     35 }  // namespace media
     36 
     37 namespace content {
     38 // This class handles video accelerators directly through a V4L2 device exported
     39 // by the hardware blocks.
     40 //
     41 // The threading model of this class is driven by the fact that it needs to
     42 // interface two fundamentally different event queues -- the one Chromium
     43 // provides through MessageLoop, and the one driven by the V4L2 devices which
     44 // is waited on with epoll().  There are three threads involved in this class:
     45 //
     46 // * The child thread, which is the main GPU process thread which calls the
     47 //   media::VideoDecodeAccelerator entry points.  Calls from this thread
     48 //   generally do not block (with the exception of Initialize() and Destroy()).
     49 //   They post tasks to the decoder_thread_, which actually services the task
     50 //   and calls back when complete through the
     51 //   media::VideoDecodeAccelerator::Client interface.
     52 // * The decoder_thread_, owned by this class.  It services API tasks, through
     53 //   the *Task() routines, as well as V4L2 device events, through
     54 //   ServiceDeviceTask().  Almost all state modification is done on this thread
     55 //   (this doesn't include buffer (re)allocation sequence, see below).
     56 // * The device_poll_thread_, owned by this class.  All it does is epoll() on
     57 //   the V4L2 device in DevicePollTask() and schedule a ServiceDeviceTask() on
     58 //   the decoder_thread_ when something interesting happens.
     59 //   TODO(sheu): replace this thread with a TYPE_IO decoder_thread_.
     60 //
     61 // Note that this class has (almost) no locks, apart from the pictures_assigned_
     62 // WaitableEvent. Everything (apart from buffer (re)allocation) is serviced on
     63 // the decoder_thread_, so there are no synchronization issues.
     64 // ... well, there are, but it's a matter of getting messages posted in the
     65 // right order, not fiddling with locks.
     66 // Buffer creation is a two-step process that is serviced partially on the
     67 // Child thread, because we need to wait for the client to provide textures
     68 // for the buffers we allocate. We cannot keep the decoder thread running while
     69 // the client allocates Pictures for us, because we need to REQBUFS first to get
     70 // the required number of output buffers from the device and that cannot be done
     71 // unless we free the previous set of buffers, leaving the decoding in an
     72 // inoperable state for the duration of the wait for Pictures. So to prevent
     73 // subtle races (esp. if we get Reset() in the meantime), we block the decoder
     74 // thread while we wait for AssignPictureBuffers from the client.
     75 class CONTENT_EXPORT V4L2VideoDecodeAccelerator
     76     : public media::VideoDecodeAccelerator {
     77  public:
          // |device| is passed as a scoped_ptr by value, i.e. ownership is handed to
          // this object.  NOTE(review): |io_client_| and |io_message_loop_proxy|
          // presumably initialize the same-named members below, whose comments say
          // that |io_client_| callbacks must be executed on
          // |io_message_loop_proxy_| -- confirm against the .cc file.
     78   V4L2VideoDecodeAccelerator(
     79       EGLDisplay egl_display,
     80       EGLContext egl_context,
     81       const base::WeakPtr<Client>& io_client_,
     82       const base::Callback<bool(void)>& make_context_current,
     83       scoped_ptr<V4L2Device> device,
     84       const scoped_refptr<base::MessageLoopProxy>& io_message_loop_proxy);
     85   virtual ~V4L2VideoDecodeAccelerator();
     86 
     87   // media::VideoDecodeAccelerator implementation.
     88   // Note: Initialize() and Destroy() are synchronous.
     89   virtual bool Initialize(media::VideoCodecProfile profile,
     90                           Client* client) OVERRIDE;
     91   virtual void Decode(const media::BitstreamBuffer& bitstream_buffer) OVERRIDE;
     92   virtual void AssignPictureBuffers(
     93       const std::vector<media::PictureBuffer>& buffers) OVERRIDE;
     94   virtual void ReusePictureBuffer(int32 picture_buffer_id) OVERRIDE;
     95   virtual void Flush() OVERRIDE;
     96   virtual void Reset() OVERRIDE;
     97   virtual void Destroy() OVERRIDE;
     98   virtual bool CanDecodeOnIOThread() OVERRIDE;
     99 
    100  private:
    101   // These are rather subjectively tuned.
    102   enum {
    103     kInputBufferCount = 8,
    104     // TODO(posciak): determine input buffer size based on level limits.
    105     // See http://crbug.com/255116.
    106     // Input bitstream buffer size for up to 1080p streams.
    107     kInputBufferMaxSizeFor1080p = 1024 * 1024,
    108     // Input bitstream buffer size for up to 4k streams.
    109     kInputBufferMaxSizeFor4k = 4 * kInputBufferMaxSizeFor1080p,
    110     // Number of output buffers to use for each VDA stage above what's required
    111     // by the decoder (e.g. DPB size, in H264).  We need
    112     // media::limits::kMaxVideoFrames to fill up the GpuVideoDecode pipeline,
    113     // and +1 for a frame in transit.
    114     kDpbOutputBufferExtraCount = media::limits::kMaxVideoFrames + 1,
    115   };
    116 
    117   // Internal state of the decoder.
    118   enum State {
    119     kUninitialized,      // Initialize() not yet called.
    120     kInitialized,        // Initialize() returned true; ready to start decoding.
    121     kDecoding,           // DecodeBufferInitial() successful; decoding frames.
    122     kResetting,          // Presently resetting.
    123     kAfterReset,         // After Reset(), ready to start decoding again.
    124     kChangingResolution, // Performing resolution change, all remaining
    125                          // pre-change frames decoded and processed.
    126     kError,              // Error in kDecoding state.
    127   };
    128 
    129   enum BufferId {
    130     kFlushBufferId = -2  // Buffer id for flush buffer, queued by FlushTask().
    131   };
    132 
    133   // Auto-destruction reference for BitstreamBuffer, for message-passing from
    134   // Decode() to DecodeTask().
    135   struct BitstreamBufferRef;
    136 
    137   // Auto-destruction reference for EGLSync (for message-passing).
    138   struct EGLSyncKHRRef;
    139 
    140   // Record for decoded pictures that can be sent to PictureReady.
    141   struct PictureRecord;
    142 
    143   // Record for input buffers.
    144   struct InputRecord {
    145     InputRecord();
    146     ~InputRecord();
    147     bool at_device;         // held by device.
    148     void* address;          // mmap() address.
    149     size_t length;          // mmap() length.
    150     off_t bytes_used;       // bytes filled in the mmap() segment.
    151     int32 input_id;         // triggering input_id as given to Decode().
    152   };
    153 
    154   // Record for output buffers.
    155   struct OutputRecord {
    156     OutputRecord();
    157     ~OutputRecord();
    158     bool at_device;         // held by device.
    159     bool at_client;         // held by client.
    160     EGLImageKHR egl_image;  // EGLImageKHR for the output buffer.
    161     EGLSyncKHR egl_sync;    // sync the compositor's use of the EGLImage.
    162     int32 picture_id;       // picture buffer id as returned to PictureReady().
    163     bool cleared;           // Whether the texture is cleared and safe to render
    164                             // from. See TextureManager for details.
    165   };
    166 
    167   //
    168   // Decoding tasks, to be run on decoder_thread_.
    169   //
    170 
    171   // Enqueue a BitstreamBuffer to decode.  This will enqueue a buffer to the
    172   // decoder_input_queue_, then queue a DecodeBufferTask() to actually decode
    173   // the buffer.
    174   void DecodeTask(const media::BitstreamBuffer& bitstream_buffer);
    175 
    176   // Decode from the buffers queued in decoder_input_queue_.  Calls
    177   // DecodeBufferInitial() or DecodeBufferContinue() as appropriate.
    178   void DecodeBufferTask();
    179   // Advance to the next fragment that begins a frame.
    180   bool AdvanceFrameFragment(const uint8* data, size_t size, size_t* endpos);
    181   // Schedule another DecodeBufferTask() if we're behind.
    182   void ScheduleDecodeBufferTaskIfNeeded();
    183 
    184   // Return true if we should continue to schedule DecodeBufferTask()s after
    185   // completion.  Store the amount of input actually consumed in |endpos|.
    186   bool DecodeBufferInitial(const void* data, size_t size, size_t* endpos);
    187   bool DecodeBufferContinue(const void* data, size_t size);
    188 
    189   // Accumulate data for the next frame to decode.  May return false in
    190   // non-error conditions; for example when pipeline is full and should be
    191   // retried later.
    192   bool AppendToInputFrame(const void* data, size_t size);
    193   // Flush data for one decoded frame.
    194   bool FlushInputFrame();
    195 
    196   // Service I/O on the V4L2 devices.  This task should only be scheduled from
    197   // DevicePollTask().  If |event_pending| is true, one or more events
    198   // on the file descriptor are pending.
    199   void ServiceDeviceTask(bool event_pending);
    200   // Handle the various device queues.
    201   void Enqueue();
    202   void Dequeue();
    203   // Handle incoming events.
    204   void DequeueEvents();
    205   // Enqueue a buffer on the corresponding queue.
    206   bool EnqueueInputRecord();
    207   bool EnqueueOutputRecord();
    208 
    209   // Process a ReusePictureBuffer() API call.  The API call creates an EGLSync
    210   // object on the main (GPU process) thread; we will record this object so we
    211   // can wait on it before reusing the buffer.
    212   void ReusePictureBufferTask(int32 picture_buffer_id,
    213                               scoped_ptr<EGLSyncKHRRef> egl_sync_ref);
    214 
    215   // Flush() task.  Child thread should not submit any more buffers until it
    216   // receives the NotifyFlushDone callback.  This task will schedule an empty
    217   // BitstreamBufferRef (with input_id == kFlushBufferId) to perform the flush.
    218   void FlushTask();
    219   // Notify the client of a flush completion, if required.  This should be
    220   // called any time a relevant queue could potentially be emptied: see
    221   // function definition.
    222   void NotifyFlushDoneIfNeeded();
    223 
    224   // Reset() task.  This task will schedule a ResetDoneTask() that will send
    225   // the NotifyResetDone callback, then set the decoder state to kResetting so
    226   // that all intervening tasks will drain.
    227   void ResetTask();
    228   // ResetDoneTask() will set the decoder state back to kAfterReset, so
    229   // subsequent decoding can continue.
    230   void ResetDoneTask();
    231 
    232   // Device destruction task.
    233   void DestroyTask();
    234 
    235   // Attempt to start/stop device_poll_thread_.
    236   bool StartDevicePoll();
    237   // If |keep_input_state| is true, don't reset input state; used during
    238   // resolution change.
    239   bool StopDevicePoll(bool keep_input_state);
    240 
          // Resolution change handling.  NOTE(review): presumably related to the
          // kChangingResolution state and |resolution_change_pending_| below; the
          // exact sequencing is only visible in the .cc file.
    241   void StartResolutionChangeIfNeeded();
    242   void FinishResolutionChange();
    243 
    244   // Try to get output format, detected after parsing the beginning
    245   // of the stream. Sets |again| to true if more parsing is needed.
    246   bool GetFormatInfo(struct v4l2_format* format, bool* again);
    247   // Create output buffers for the given |format|.
    248   bool CreateBuffersForFormat(const struct v4l2_format& format);
    249 
    250   //
    251   // Device tasks, to be run on device_poll_thread_.
    252   //
    253 
    254   // The device task.  Per the threading notes above the class, this is where
          // device_poll_thread_ waits on the V4L2 device before scheduling a
          // ServiceDeviceTask() on decoder_thread_.
          // NOTE(review): the meaning of |poll_device| is not documented in this
          // header -- confirm against the .cc file.
    255   void DevicePollTask(bool poll_device);
    256 
    257   //
    258   // Safe from any thread.
    259   //
    260 
    261   // Error notification (using PostTask() to child thread, if necessary).
    262   void NotifyError(Error error);
    263 
    264   // Set the decoder_thread_ state (using PostTask to decoder thread, if
    265   // necessary).
    266   void SetDecoderState(State state);
    267 
    268   //
    269   // Other utility functions.  Called on decoder_thread_, unless
    270   // decoder_thread_ is not yet started, in which case the child thread can call
    271   // these (e.g. in Initialize() or Destroy()).
    272   //
    273 
    274   // Create the buffers we need.
    275   bool CreateInputBuffers();
    276   bool CreateOutputBuffers();
    277 
    278   //
    279   // Methods run on child thread.
    280   //
    281 
    282   // Destroy buffers.
    283   void DestroyInputBuffers();
    284   // In contrast to DestroyInputBuffers, which is called only from the
    285   // destructor, we call DestroyOutputBuffers also during playback, on
    286   // resolution change.  Even if anything fails along the way, we still want
    287   // to go on and clean up as much as possible, so return false if this
    288   // happens, so that the caller can error out on resolution change.
    289   bool DestroyOutputBuffers();
    290   void ResolutionChangeDestroyBuffers();
    291 
    292   // Send decoded pictures to PictureReady.
    293   void SendPictureReady();
    294 
    295   // Callback that indicates a picture has been cleared.
    296   void PictureCleared();
    297 
    298   // This method determines whether a resolution change event processing
    299   // is indeed required by returning true iff:
    300   // - width or height of the new format is different than previous format; or
    301   // - V4L2_CID_MIN_BUFFERS_FOR_CAPTURE has changed.
    302   bool IsResolutionChangeNecessary();
    303 
    304   // Our original calling message loop for the child thread.
    305   scoped_refptr<base::MessageLoopProxy> child_message_loop_proxy_;
    306 
    307   // Message loop of the IO thread.
    308   scoped_refptr<base::MessageLoopProxy> io_message_loop_proxy_;
    309 
    310   // WeakPtr<> pointing to |this| for use in posting tasks from the decoder or
    311   // device worker threads back to the child thread.  Because the worker threads
    312   // are members of this class, any task running on those threads is guaranteed
    313   // that this object is still alive.  As a result, tasks posted from the child
    314   // thread to the decoder or device thread should use base::Unretained(this),
    315   // and tasks posted the other way should use |weak_this_|.
    316   base::WeakPtr<V4L2VideoDecodeAccelerator> weak_this_;
    317 
    318   // To expose client callbacks from VideoDecodeAccelerator.
    319   // NOTE: all calls to these objects *MUST* be executed on
    320   // child_message_loop_proxy_.
    321   scoped_ptr<base::WeakPtrFactory<Client> > client_ptr_factory_;
    322   base::WeakPtr<Client> client_;
    323   // Callbacks to |io_client_| must be executed on |io_message_loop_proxy_|.
    324   base::WeakPtr<Client> io_client_;
    325 
    326   //
    327   // Decoder state, owned and operated by decoder_thread_.
    328   // Before decoder_thread_ has started, the decoder state is managed by
    329   // the child (main) thread.  After decoder_thread_ has started, the decoder
    330   // thread should be the only one managing these.
    331   //
    332 
    333   // This thread services tasks posted from the VDA API entry points by the
    334   // child thread and device service callbacks posted from the device thread.
    335   base::Thread decoder_thread_;
    336   // Decoder state machine state.
    337   State decoder_state_;
    338   // BitstreamBuffer we're presently reading.
    339   scoped_ptr<BitstreamBufferRef> decoder_current_bitstream_buffer_;
    340   // The V4L2Device this class is operating upon.
    341   scoped_ptr<V4L2Device> device_;
    342   // FlushTask() and ResetTask() should not affect buffers that have been
    343   // queued afterwards.  For flushing or resetting the pipeline then, we will
    344   // delay these buffers until after the flush or reset completes.
    345   int decoder_delay_bitstream_buffer_id_;
    346   // Input buffer we're presently filling.
    347   int decoder_current_input_buffer_;
    348   // We track the number of buffer decode tasks we have scheduled, since each
    349   // task execution should complete one buffer.  If we fall behind (due to
    350   // resource backpressure, etc.), we'll have to schedule more to catch up.
    351   int decoder_decode_buffer_tasks_scheduled_;
    352   // Picture buffers held by the client.
    353   int decoder_frames_at_client_;
    354   // Are we flushing?
    355   bool decoder_flushing_;
    356   // Got a notification from driver that it reached resolution change point
    357   // in the stream.
    358   bool resolution_change_pending_;
    359   // Got a reset request while we were performing resolution change.
    360   bool resolution_change_reset_pending_;
    361   // Input queue for decoder_thread_: BitstreamBuffers in.
    362   std::queue<linked_ptr<BitstreamBufferRef> > decoder_input_queue_;
    363   // For H264 decode, hardware requires that we send it frame-sized chunks.
    364   // We'll need to parse the stream.
    365   scoped_ptr<media::H264Parser> decoder_h264_parser_;
    366   // Set if the decoder has a pending incomplete frame in an input buffer.
    367   bool decoder_partial_frame_pending_;
    368 
    369   //
    370   // Hardware state and associated queues.  Since decoder_thread_ services
    371   // the hardware, decoder_thread_ owns these too.
    372   // output_buffer_map_, free_output_buffers_ and output_planes_count_ are an
    373   // exception during the buffer (re)allocation sequence, when the
    374   // decoder_thread_ is blocked briefly while the Child thread manipulates
    375   // them.
    376   //
    377 
    378   // Completed decode buffers.
    379   std::queue<int> input_ready_queue_;
    380 
    381   // Input buffer state.
    382   bool input_streamon_;
    383   // Input buffers enqueued to device.
    384   int input_buffer_queued_count_;
    385   // Input buffers ready to use, as a LIFO since we don't care about ordering.
    386   std::vector<int> free_input_buffers_;
    387   // Mapping of int index to input buffer record.
    388   std::vector<InputRecord> input_buffer_map_;
    389 
    390   // Output buffer state.
    391   bool output_streamon_;
    392   // Output buffers enqueued to device.
    393   int output_buffer_queued_count_;
    394   // Output buffers ready to use, as a FIFO since we want oldest-first to hide
    395   // synchronization latency with GL.
    396   std::queue<int> free_output_buffers_;
    397   // Mapping of int index to output buffer record.
    398   std::vector<OutputRecord> output_buffer_map_;
    399   // Required size of DPB for decoding.
    400   int output_dpb_size_;
    401   // Stores the number of planes (i.e. separate memory buffers) for output.
    402   size_t output_planes_count_;
    403 
    404   // Pictures that are ready but not sent to PictureReady yet.
    405   std::queue<PictureRecord> pending_picture_ready_;
    406 
    407   // The number of pictures that are sent to PictureReady and will be cleared.
    408   int picture_clearing_count_;
    409 
    410   // Used by the decoder thread to wait for AssignPictureBuffers to arrive
    411   // to avoid races with potential Reset requests.
    412   base::WaitableEvent pictures_assigned_;
    413 
    414   // Output picture size.
    415   gfx::Size frame_buffer_size_;
    416 
    417   //
    418   // The device polling thread handles notifications of V4L2 device changes.
    419   //
    420 
    421   // The thread.
    422   base::Thread device_poll_thread_;
    423 
    424   //
    425   // Other state, held by the child (main) thread.
    426   //
    427 
    428   // Make our context current before running any EGL entry points.
    429   base::Callback<bool(void)> make_context_current_;
    430 
    431   // EGL state
    432   EGLDisplay egl_display_;
    433   EGLContext egl_context_;
    434 
    435   // The codec we'll be decoding for.
    436   media::VideoCodecProfile video_profile_;
    437 
    438   // The WeakPtrFactory for |weak_this_|.
    439   base::WeakPtrFactory<V4L2VideoDecodeAccelerator> weak_this_factory_;
    440 
    441   DISALLOW_COPY_AND_ASSIGN(V4L2VideoDecodeAccelerator);
    442 };
    443 
    444 }  // namespace content
    445 
    446 #endif  // CONTENT_COMMON_GPU_MEDIA_V4L2_VIDEO_DECODE_ACCELERATOR_H_
    447