/*!
 ************************************************************************
 *  \file
 *     global.h
 *  \brief
 *     global definitions for H.264 decoder.
 *  \author
 *     Copyright (C) 1999  Telenor Satellite Services,Norway
 *                         Ericsson Radio Systems, Sweden
 *
 *     Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
 *
 *     Telenor Satellite Services
 *     Keysers gt.13                       tel.:   +47 23 13 86 98
 *     N-0130 Oslo,Norway                  fax.:   +47 22 77 79 80
 *
 *     Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
 *
 *     Ericsson Radio Systems
 *     KI/ERA/T/VV
 *     164 80 Stockholm, Sweden
 *
 ************************************************************************
 */
#ifndef _GLOBAL_H_
#define _GLOBAL_H_

#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>
#include <time.h>
#include <sys/timeb.h>

#include <bfc/platform/types.h>
#include "win32.h"
#include "defines.h"
#include "ifunctions.h"
#include "parsetcommon.h"
#include "types.h"
#include "frame.h"
#include "nalucommon.h"
#include "memcache.h"
#include <mmintrin.h>
#ifdef H264_IPP
//#include "../tools/staticlib/ipp_px.h"
#include "ippdefs.h"
#include "ippcore.h"
#include "ipps.h"
#include "ippi.h"
#include "ippvc.h"
#endif
/* benski> not the best place for this but it works for now */
#ifdef _M_IX86
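// must be a multiple of 16 (NOTE: the memzero_*32 loops below store 32 bytes per pass, so their byte count needs to be a multiple of 32)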
#pragma warning(disable: 4799)
/*
 * Zero a buffer using cached MMX stores (MSVC x86 inline assembly).
 *
 *   dst  start of the region to clear
 *   i    byte count
 *
 * Each pass stores 32 bytes and then subtracts 32 from i; the loop repeats
 * while the result is still greater than zero (signed test).  At least 32
 * bytes are therefore always written, and a count that is not a multiple
 * of 32 is effectively rounded up to the next one.
 *
 * No EMMS is executed here (warning C4799 is disabled just above), so the
 * MMX state is still in use when the function returns.
 */
static inline void memzero_cache32(void *dst, unsigned long i)
{
	
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

loopwrite:

		movq 0[edi], mm0
		movq 8[edi], mm0
		movq 16[edi], mm0
		movq 24[edi], mm0
		lea edi, [edi+32]	// advance to the next 32-byte chunk
		sub i, 32		// count down the bytes still to clear
		jg loopwrite

	}
}

/*
 * Zero a buffer using MOVNTQ stores (MSVC x86 inline assembly).
 *
 *   dst  start of the region to clear
 *   i    byte count
 *
 * Same loop shape as memzero_cache32(): 32 bytes are stored per pass and the
 * loop repeats while (i -= 32) is still greater than zero (signed test), so
 * at least 32 bytes are always written.  The stores are MOVNTQ, which the
 * commentary further down this file describes as streaming stores that
 * bypass the cache.
 *
 * Neither SFENCE nor EMMS is executed here.
 */
static inline void memzero_fast32(void *dst, unsigned long i)
{
	
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

loopwrite:

		movntq 0[edi], mm0
		movntq 8[edi], mm0
		movntq 16[edi], mm0
		movntq 24[edi], mm0

		lea edi, [edi+32]	// advance to the next 32-byte chunk
		sub i, 32		// count down the bytes still to clear
		jg loopwrite

	}
}

/*
 * Zero exactly 64 bytes at dst with eight 8-byte MMX stores.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero64(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
		movq 8[edi], mm0
		movq 16[edi], mm0
		movq 24[edi], mm0
		movq 32[edi], mm0
		movq 40[edi], mm0
		movq 48[edi], mm0
		movq 56[edi], mm0
	}
}

/*
 * Zero exactly 128 bytes at dst with sixteen 8-byte MMX stores.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero128(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
		movq 8[edi], mm0
		movq 16[edi], mm0
		movq 24[edi], mm0
		movq 32[edi], mm0
		movq 40[edi], mm0
		movq 48[edi], mm0
		movq 56[edi], mm0
		movq 64[edi], mm0
		movq 72[edi], mm0
		movq 80[edi], mm0
		movq 88[edi], mm0
		movq 96[edi], mm0
		movq 104[edi], mm0
		movq 112[edi], mm0
		movq 120[edi], mm0
	}
}

/*
 * Zero exactly 24 bytes at dst with three 8-byte MMX stores.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero24(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
		movq 8[edi], mm0
		movq 16[edi], mm0
	}
}

/*
 * Zero exactly 48 bytes at dst with six 8-byte MMX stores.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero48(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
		movq 8[edi], mm0
		movq 16[edi], mm0
		movq 24[edi], mm0
		movq 32[edi], mm0
		movq 40[edi], mm0
	}
}

/*
 * Zero exactly 16 bytes at dst with two 8-byte MMX stores.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero16(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
		movq 8[edi], mm0
	}
}

/*
 * Zero exactly 8 bytes at dst with a single MMX store.
 * Does not execute EMMS; the MMX state is left in use.
 */
static inline void memzero8(void *dst)
{
	__asm {
		pxor	mm0, mm0	// mm0 = 0
		mov edi, dst

		movq 0[edi], mm0
	}
}

/*
 * Finish a sequence of MMX-based memzero calls by invoking _mm_empty()
 * (the EMMS intrinsic from <mmintrin.h>).  The memzero helpers above leave
 * the MMX state in use; presumably callers are expected to call this once
 * before any floating-point code runs -- TODO confirm against call sites.
 */
static inline void memset_fast_end()
{
	_mm_empty();
}

// Highly optimized memcpy() routine for the AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
//
// NOTE(review): the macros below are substituted textually into the inline
// assembly of memcpy_amd().  IN_CACHE_COPY and UNCACHED_COPY are not
// parenthesized; their only visible uses are `IN_CACHE_COPY/64` and
// `UNCACHED_COPY/64`, which still evaluate as intended (1024 and 3152).

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.   This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

// BLOCK_PREFETCH_COPY is documentation only: it expands to the bare
// identifier `infinity` and is not referenced by the visible code.
#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch 
// CACHEBLOCK uses the assembler's hex suffix (80h = 128), so it is only
// meaningful inside an __asm block, not in C expressions.
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

// Inline assembly syntax for use with Visual C++


/*
 * memcpy() replacement written in MSVC x86 inline assembly.
 *
 *   dest  destination buffer
 *   src   source buffer
 *   n     number of bytes to copy
 *
 * Returns dest.  There is no C `return` statement: the value is left in EAX
 * by the last instruction of the __asm block.
 *
 * Strategy, chosen from n:
 *   - n < TINY_BLOCK_COPY: computed jump into an unrolled MOVSD table, then
 *     REP MOVSB for the final 0-3 bytes.
 *   - otherwise the destination is first advanced to an 8-byte boundary with
 *     MOVSB (skipped for sizes above 32K up to 64K), and the data is moved in
 *     64-byte blocks by one of three loops:
 *       * fewer than IN_CACHE_COPY/64 blocks: MOVQ loads and MOVQ stores
 *         with PREFETCHNTA;
 *       * fewer than UNCACHED_COPY/64 blocks: MOVQ loads and MOVNTQ stores
 *         with PREFETCHNTA;
 *       * anything larger: block prefetch in groups of CACHEBLOCK cache
 *         lines followed by MOVNTQ stores, with the leftover blocks handled
 *         by the MOVNTQ loop above.
 *     The remaining dwords and bytes then go through the same tail code as
 *     the tiny case.
 *
 * The routine clears the direction flag (CLD) and ends with SFENCE.  EMMS is
 * commented out, so the MMX state is left in use on return.
 */
static void * memcpy_amd(void *dest, const void *src, size_t n)
{
  __asm {

	mov		ecx, [n]		// number of bytes to copy
	mov		edi, [dest]		// destination
	mov		esi, [src]		// source
	mov		ebx, ecx		// keep a copy of count

	cld
	cmp		ecx, TINY_BLOCK_COPY
	jb		$memcpy_ic_3	// tiny? skip mmx copy

	cmp		ecx, 32*1024		// dont align between 32k-64k because
	jbe		$memcpy_do_align	//  it appears to be slower
	cmp		ecx, 64*1024
	jbe		$memcpy_align_done
$memcpy_do_align:
	mov		ecx, 8			// a trick thats faster than rep movsb...
	sub		ecx, edi		// align destination to qword
	and		ecx, 111b		// get the low bits
	sub		ebx, ecx		// update copy count
	neg		ecx				// set up to jump into the array
	add		ecx, offset $memcpy_align_done
	jmp		ecx				// jump to array of movsbs

align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:			// destination is qword aligned if the align path ran
	mov		ecx, ebx		// number of bytes left to copy
	shr		ecx, 6			// get 64-byte block count
	jz		$memcpy_ic_2	// finish the last few bytes

	cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
	jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:			// 64-byte block copies, in-cache copy

	prefetchnta [esi + (200*64/34+192)]		// start reading ahead

	movq	mm0, [esi+0]	// read 64 bits
	movq	mm1, [esi+8]
	movq	[edi+0], mm0	// write 64 bits
	movq	[edi+8], mm1	//    note:  the normal movq writes the
	movq	mm2, [esi+16]	//    data to cache; a cache line will be
	movq	mm3, [esi+24]	//    allocated as needed, to store the data
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3

	add		esi, 64			// update source pointer
	add		edi, 64			// update destination pointer
	dec		ecx				// count down
	jnz		$memcpy_ic_1	// last 64-byte block?

$memcpy_ic_2:
	mov		ecx, ebx		// has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr		ecx, 2			// dword count
	and		ecx, 1111b		// only look at the "remainder" bits
	neg		ecx				// set up to jump into the array
	add		ecx, offset $memcpy_last_few
	jmp		ecx				// jump to array of movsds

$memcpy_uc_test:
	cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
	jae		$memcpy_bp_1

$memcpy_64_test:
	or		ecx, ecx		// _tail end of block prefetch will jump here
	jz		$memcpy_ic_2	// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, its faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:				// 64-byte blocks, uncached copy

	prefetchnta [esi + (200*64/34+192)]		// start reading ahead

	movq	mm0,[esi+0]		// read 64 bits
	add		edi,64			// update destination pointer
	movq	mm1,[esi+8]
	add		esi,64			// update source pointer
	movq	mm2,[esi-48]
	movntq	[edi-64], mm0	// write 64 bits, bypassing the cache
	movq	mm0,[esi-40]	//    note: movntq also prevents the CPU
	movntq	[edi-56], mm1	//    from READING the destination address
	movq	mm1,[esi-32]	//    into the cache, only to be over-written
	movntq	[edi-48], mm2	//    so that also helps performance
	movq	mm2,[esi-24]
	movntq	[edi-40], mm0
	movq	mm0,[esi-16]
	movntq	[edi-32], mm1
	movq	mm1,[esi-8]
	movntq	[edi-24], mm2
	movntq	[edi-16], mm0
	dec		ecx
	movntq	[edi-8], mm1
	jnz		$memcpy_uc_1	// last 64-byte block?

	jmp		$memcpy_ic_2		// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:			// large blocks, block prefetch copy

	cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
	jl		$memcpy_64_test			// no, back to regular uncached copy

	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
	add		esi, CACHEBLOCK * 64	// move to the top of the block
align 16
$memcpy_bp_2:
	mov		edx, [esi-64]		// grab one address per cache line
	mov		edx, [esi-128]		// grab one address per cache line
	sub		esi, 128			// go reverse order
	dec		eax					// count down the cache lines
	jnz		$memcpy_bp_2		// keep grabbing more lines into cache

	mov		eax, CACHEBLOCK		// now that its in cache, do the copy
align 16
$memcpy_bp_3:
	movq	mm0, [esi   ]		// read 64 bits
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add		esi, 64				// update source pointer
	movntq	[edi   ], mm0		// write 64 bits, bypassing cache
	movntq	[edi+ 8], mm1		//    note: movntq also prevents the CPU
	movntq	[edi+16], mm2		//    from READING the destination address 
	movntq	[edi+24], mm3		//    into the cache, only to be over-written,
	movntq	[edi+32], mm4		//    so that also helps performance
	movntq	[edi+40], mm5
	movntq	[edi+48], mm6
	movntq	[edi+56], mm7
	add		edi, 64				// update dest pointer

	dec		eax					// count down

	jnz		$memcpy_bp_3		// keep copying
	sub		ecx, CACHEBLOCK		// update the 64-byte block count
	jmp		$memcpy_bp_1		// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".   Then it handles the last few bytes.
// The computed jump above lands ECX bytes before $memcpy_last_few, which
// assumes every movsd in this table assembles to a single byte.
align 4
	movsd
	movsd			// perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd			// perform last 1-7 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:		// dword aligned from before movsds
	mov		ecx, ebx	// has valid low 2 bits of the byte count
	and		ecx, 11b	// the last few cows must come home
	jz		$memcpy_final	// no more, lets leave
	rep		movsb		// the last 1, 2, or 3 bytes

$memcpy_final: 
//	emms				// clean up the MMX state
	sfence				// flush the write buffer
	mov		eax, [dest]	// ret value = destination pointer

    }
}

#elif defined(_M_X64)
/*
 * Zero exactly 24 bytes starting at dst.
 *
 * The previous loop indexed an int32_t array with a byte offset that stepped
 * by 4, so it wrote six dwords at a 16-byte stride (reaching byte 84) and
 * left most of the first 24 bytes untouched.  memset() clears the intended
 * contiguous range, matching the x86 MMX version of this helper.
 */
static inline void memzero24(void *dst)
{
	memset(dst, 0, 24);
}
/* No MMX state to release on this target; kept as a no-op so callers stay unconditional. */
static inline void memset_fast_end(void) {}
#else
/*
 * Zero i bytes starting at dst (portable fallback).
 *
 *   dst  start of the region to clear
 *   i    byte count; the name suggests callers pass a multiple of 16, but
 *        any count, including 0, is handled
 *
 * The previous loop used a byte offset that stepped by 4 as an index into an
 * int32_t array, so it cleared one dword out of every four and ran up to
 * four times past the requested length.  It also compared a signed counter
 * against the unsigned length.
 */
static inline void memzero_fast16(void *dst,  unsigned long i)
{
	memset(dst, 0, i);
}
/*
 * Zero exactly 24 bytes starting at dst (portable fallback).
 *
 * The previous loop indexed an int32_t array with a byte offset that stepped
 * by 4, so it wrote six dwords at a 16-byte stride (reaching byte 84) and
 * left most of the first 24 bytes untouched.  memset() clears the intended
 * contiguous range.
 */
static inline void memzero24(void *dst)
{
	memset(dst, 0, 24);
}
/* No MMX state to release on this target; kept as a no-op so callers stay unconditional. */
static inline void memset_fast_end(void) {}
#endif

// Sentinel for h264_ref_t values.  0x80000000 does not fit in a 32-bit int,
// so the cast is implementation-defined; on two's-complement targets it
// yields INT_MIN.
#define UNDEFINED_REFERENCE ((int)0x80000000)
typedef int32_t h264_ref_t; //!< type of the ref_pic_id / ref_id fields below

#define ET_SIZE 300      //!< size of error text buffer
extern char errortext[ET_SIZE]; //!< buffer for error message for exit with error()
// CPU feature flags, defined elsewhere (presumably non-zero when the
// instruction set is available -- TODO confirm where they are set).
extern int sse2_flag, mmx_flag, sse_flag, sse3_flag, sse4_1_flag;
/***********************************************************************
 * T y p e    d e f i n i t i o n s    f o r    J M
 ***********************************************************************
 */

//! Colour component selector; the numeric values are fixed explicitly.
typedef enum
{
  LumaComp = 0,   //!< luma
  CrComp   = 1,   //!< chroma Cr
  CbComp   = 2    //!< chroma Cb
} Color_Component;

/***********************************************************************
 * D a t a    t y p e s   f o r  C A B A C
 ***********************************************************************
 */

//! Result record filled in by the getNeighbour* callbacks declared later in this file.
//! NOTE(review): field meanings are inferred from their names -- confirm against the neighbour code.
typedef struct pix_pos
{
  int   available, mb_addr;   //!< presumably availability flag and macroblock address
  short x, y;                 //!< presumably coordinates inside the macroblock
  short pos_x, pos_y;         //!< presumably coordinates inside the picture
} PixelPos;

//! struct to characterize the state of the arithmetic coding engine
//! NOTE(review): per-field notes are inferred from the names -- confirm against the CABAC decoder.
typedef struct
{
  unsigned int    Drange;         //!< presumably the current interval range
  unsigned int    Dvalue;         //!< presumably the current code value
  int             DbitsLeft;      //!< presumably the number of buffered bits still unread
  byte            *Dcodestrm;     //!< pointer into the code stream bytes
  int             *Dcodestrm_len; //!< pointer to a length/position counter for Dcodestrm
} DecodingEnvironment;

//! Convenience pointer type for DecodingEnvironment
typedef DecodingEnvironment *DecodingEnvironmentPtr;

//! Motion vector as two shorts (the mvd comment in Macroblock lists the components as [x,y])
typedef short MotionVector[2];

//! definition of motion parameters
typedef struct pic_motion
{
	h264_ref_t ref_pic_id;   //!< reference picture identifier
	h264_ref_t ref_id;       //!< reference picture identifier
	MotionVector mv;         //!< motion vector
	char ref_idx;            //!< reference picture index
} PicMotion;

// TODO: benski> might be more efficient to make a [list][subblock_y][subblock_x] array of these values instead of parallel arrays
//! Motion data, one 2-D PicMotion array for each of two lists
typedef struct motion_params
{
	PicMotion **motion[2];     //!< indexed [list], then two further levels (presumably [y][x] -- TODO confirm)
  byte **     moving_block;   //!< 2-D byte array (presumably a per-block "moving" flag -- TODO confirm)
} MotionParams;

//! struct for context management (one binary context model)
typedef struct
{
  uint16_t      state;        // index into state-table CP
  unsigned char MPS, dummy;   // MPS: Most Probable Symbol 0/1 CP; dummy: for alignment
} BiContextType;

//! Convenience pointer type for BiContextType
typedef BiContextType *BiContextTypePtr;


/**********************************************************************
 * C O N T E X T S   F O R   T M L   S Y N T A X   E L E M E N T S
 **********************************************************************
 */

// Number of context models per syntax element; each value is the last
// dimension of the matching array in the context structs of this file.
#define NUM_MB_TYPE_CTX  11
#define NUM_B8_TYPE_CTX  9
#define NUM_MV_RES_CTX   10
#define NUM_REF_NO_CTX   6
#define NUM_DELTA_QP_CTX 4
#define NUM_MB_AFF_CTX 4
#define NUM_TRANSFORM_SIZE_CTX 3   // used by TextureInfoContexts

// structures that will be declared somewhere else
struct storable_picture;
struct datapartition;
struct syntaxelement;

//! Context models for the motion-related syntax elements (Slice::mot_ctx points to one of these)
typedef struct
{
  BiContextType mb_type_contexts [3][NUM_MB_TYPE_CTX];
  BiContextType b8_type_contexts [2][NUM_B8_TYPE_CTX];
  BiContextType mv_res_contexts  [2][NUM_MV_RES_CTX];
  BiContextType ref_no_contexts  [2][NUM_REF_NO_CTX];
  BiContextType delta_qp_contexts[NUM_DELTA_QP_CTX];
  BiContextType mb_aff_contexts  [NUM_MB_AFF_CTX];
} MotionInfoContexts;

// Number of context models per texture-related syntax element; each value
// is the last dimension of the matching array in TextureInfoContexts.
#define NUM_IPR_CTX    2
#define NUM_CIPR_CTX   4
#define NUM_CBP_CTX    4
#define NUM_BCBP_CTX   4
#define NUM_MAP_CTX   15
#define NUM_LAST_CTX  15
#define NUM_ONE_CTX    5
#define NUM_ABS_CTX    5


//! Context models for the texture-related syntax elements (Slice::tex_ctx points to one of these).
//! NUM_BLOCK_TYPES and NUM_TRANSFORM_SIZE_CTX are defined outside this struct.
typedef struct
{
  BiContextType  transform_size_contexts [NUM_TRANSFORM_SIZE_CTX];
  BiContextType  ipr_contexts [NUM_IPR_CTX];
  BiContextType  cipr_contexts[NUM_CIPR_CTX];
  BiContextType  cbp_contexts [3][NUM_CBP_CTX];
  BiContextType  bcbp_contexts[NUM_BLOCK_TYPES][NUM_BCBP_CTX];
  BiContextType  map_contexts [2][NUM_BLOCK_TYPES][NUM_MAP_CTX+1]; // +1 for better alignment
  BiContextType  last_contexts[2][NUM_BLOCK_TYPES][NUM_LAST_CTX+1]; // +1 for better alignment
  BiContextType  one_contexts [NUM_BLOCK_TYPES][NUM_ONE_CTX];
  BiContextType  abs_contexts [NUM_BLOCK_TYPES][NUM_ABS_CTX];
} TextureInfoContexts;


//*********************** end of data type definition for CABAC *******************

/***********************************************************************
 * N e w   D a t a    t y p e s   f o r    T M L
 ***********************************************************************
 */

/*!
 * Buffer structure for decoded reference picture marking commands.
 * Nodes are chained through Next to form a singly linked list.
 */
typedef struct DecRefPicMarking_s
{
  int memory_management_control_operation;
  int difference_of_pic_nums_minus1;
  int long_term_pic_num, long_term_frame_idx;
  int max_long_term_frame_idx_plus1;
  struct DecRefPicMarking_s *Next;   //!< following command in the list
} DecRefPicMarking_t;


//! definition of pic motion parameters
//! NOTE(review): the "[list][subblock_y][subblock_x]" remarks below appear to
//! describe how arrays of this struct are indexed, since each field here is
//! a single value; the mv remark lists x before y, unlike the others -- confirm.
typedef struct pic_motion_params2
{
  h264_ref_t    ref_pic_id;    //!< reference picture identifier [list][subblock_y][subblock_x]
  h264_ref_t    ref_id;        //!< reference picture identifier [list][subblock_y][subblock_x]
  short    mv[2];         //!< motion vector       [list][subblock_x][subblock_y][component]
  char     ref_idx;       //!< reference picture   [list][subblock_y][subblock_x]
  byte     mb_field;      //!< field macroblock indicator
  byte     field_frame;   //!< indicates if co_located is field or frame.
} PicMotionParams2;

//! Macroblock
//! Per-macroblock decoding state: back-pointers to the owning slice and
//! video parameters, position, QP, stored syntax elements, neighbour
//! information and a set of function pointers used while decoding.
typedef struct macroblock
{
  struct slice       *p_Slice;                    //!< pointer to the current slice
  struct img_par     *p_Vid;                      //!< pointer to VideoParameters
  struct inp_par     *p_Inp;
  int                 mbAddrX;                    //!< current MB address
  // position of this macroblock (units presumably follow the name prefixes:
  // mb_ = macroblocks, block_ = blocks, pix_ = pixels -- TODO confirm)
  int mb_x;
  int mb_y;
  int block_x;
  int block_y;
  int block_y_aff;
  int pix_x;
  int pix_y;
  int pix_c_x;
  int pix_c_y;

  int subblock_x;
  int subblock_y;

  int           qp;                    //!< QP luma
  int           qpc[2];                //!< QP chroma
  int           qp_scaled[MAX_PLANE];  //!< QP scaled for all comps.
  Boolean       is_lossless;
  Boolean       is_intra_block;
  Boolean       is_v_block;

  short         slice_nr;
  short         delta_quant;          //!< for rate control

  struct macroblock   *mb_up;   //!< pointer to neighboring MB (CABAC)
  struct macroblock   *mb_left; //!< pointer to neighboring MB (CABAC)

  // some storage of macroblock syntax elements for global access
  int           mb_type;
  short         mvd[2][BLOCK_MULTIPLE][BLOCK_MULTIPLE][2];      //!< indices correspond to [forw,backw][block_y][block_x][x,y]
  int           cbp;
  int64         cbp_blk     [3];
  int64         cbp_bits    [3];
  int64         cbp_bits_8x8[3];

  int           i16mode;
  char          b8mode[4];
  char          b8pdir[4];
  char          ei_flag;             //!< error indicator flag that enables concealment
  char          dpl_flag;            //!< error indicator flag that signals a missing data partition
  char          ipmode_DPCM;

  // deblocking filter parameters (same names as the slice-level fields in Slice)
  short         DFDisableIdc;
  short         DFAlphaC0Offset;
  short         DFBetaOffset;

  char          c_ipred_mode;       //!< chroma intra prediction mode
  Boolean       mb_field;

  int          skip_flag;

  // addresses and availability flags of the four neighbouring macroblocks
  int mb_addr_left, mb_addr_up, mb_addr_upper_right, mb_addr_upper_left;
  Boolean mb_avail_left, mb_avail_up, mb_avail_upper_right, mb_avail_upper_left;

  Boolean       luma_transform_size_8x8_flag;
  Boolean       NoMbPartLessThan8x8Flag;

  // function pointers; the code that assigns them is not visible in this header
  void (*itrans_8x8)(struct macroblock *currMB, ColorPlane pl, int ioff, int joff);

  void (*GetMVPredictor) (struct macroblock *currMB, PixelPos *block, 
    short pmv[2], short ref_frame, struct pic_motion **motion, int mb_x, int mb_y, int blockshape_x, int blockshape_y);

  int (*read_and_store_CBP_block_bit) (struct macroblock *currMB, DecodingEnvironmentPtr  dep_dp, int type);
  char (*readRefPictureIdx)   (struct syntaxelement *currSE, struct datapartition *dP, int list);

} Macroblock;

//! Syntaxelement
//! One decoded syntax element: its value(s), code length and the CAVLC
//! mapping function used to turn (len, info) into value1/value2.
typedef struct syntaxelement
{
  int           value1;                //!< numerical value of syntax element
  int           value2;                //!< for blocked symbols, e.g. run/level
  int           len;                   //!< length of code
  //int           inf;                   //!< info part of CAVLC code

  // the trace string is only part of the struct when TRACE is non-zero,
  // so the struct size depends on that build setting
#if TRACE
  #define       TRACESTRING_SIZE 100           //!< size of trace string
  char          tracestring[TRACESTRING_SIZE]; //!< trace string
#endif

  //! for mapping of CAVLC to syntaxElement
  void  (*mapping)(int len, int info, int *value1, int *value2);
} SyntaxElement;


//! Bitstream
//! Read position and length bookkeeping for one code buffer; separate
//! counters are kept for the CABAC and CAVLC decoding paths.
typedef struct
{
  // CABAC Decoding
  int           read_len;           //!< actual position in the codebuffer, CABAC only
  int           code_len;           //!< overall codebuffer length, CABAC only
  // CAVLC Decoding
  int           frame_bitoffset;    //!< actual position in the codebuffer, bit-oriented, CAVLC only
  int           bitstream_length;   //!< overall codebuffer length, byte oriented, CAVLC only

  byte          *streamBuffer;      //!< actual codebuffer for read bytes
} Bitstream;


/*
 * Fixed-size 2-D block types.  Each size gets a "row" typedef (one row of
 * samples) and a block typedef (an array of rows), in 32-bit (int), 16-bit
 * (int16_t) and 8-bit (imgpel) element flavours.
 */

/* === 4x4 block typedefs === */
// 32 bit precision
typedef int h264_int_block_row_t[BLOCK_SIZE];
typedef h264_int_block_row_t h264_int_block_t[BLOCK_SIZE];
// 16 bit precision
typedef int16_t h264_short_block_row_t[BLOCK_SIZE];
typedef h264_short_block_row_t h264_short_block_t[BLOCK_SIZE];
// 8 bit precision
// (no imgpel 4x4 typedef is declared here; see h264_4x4_byte below for a byte-based 4x4 block)

/* === 8x8 block typedefs === */
// 32 bit precision
typedef int h264_int_8x8block_row_t[BLOCK_SIZE_8x8];
typedef h264_int_8x8block_row_t h264_int_8x8block_t[BLOCK_SIZE_8x8];
// 16 bit precision
typedef int16_t h264_short_8x8block_row_t[BLOCK_SIZE_8x8];
typedef h264_short_8x8block_row_t h264_short_8x8block_t[BLOCK_SIZE_8x8];
// 8 bit precision
typedef imgpel h264_imgpel_8x8block_row_t[BLOCK_SIZE_8x8];
typedef h264_imgpel_8x8block_row_t h264_imgpel_8x8block_t[BLOCK_SIZE_8x8];

/* === 16x16 block typedefs === */
// 32 bit precision
typedef int h264_int_macroblock_row_t[MB_BLOCK_SIZE];
typedef h264_int_macroblock_row_t h264_int_macroblock_t[MB_BLOCK_SIZE];
// 16 bit precision
typedef int16_t h264_short_macroblock_row_t[MB_BLOCK_SIZE];
typedef h264_short_macroblock_row_t h264_short_macroblock_t[MB_BLOCK_SIZE];
// 8 bit precision
typedef imgpel h264_imgpel_macroblock_row_t[MB_BLOCK_SIZE];
typedef h264_imgpel_macroblock_row_t h264_imgpel_macroblock_t[MB_BLOCK_SIZE];




typedef int h264_pic_position[2];                   // pair of ints; element type of the PicPos helper array in img_par
typedef byte h264_4x4_byte[BLOCK_SIZE][BLOCK_SIZE]; // 4x4 block of bytes
typedef h264_4x4_byte h264_nz_coefficient[3];       // three 4x4 byte blocks; element type of nz_coeff in img_par

//! DataPartition
//! Pairs a bitstream with the CABAC decoding engine state that reads it.
typedef struct datapartition
{

  Bitstream           *bitstream;   //!< code buffer of this partition
  DecodingEnvironment de_cabac;     //!< arithmetic decoder state (stored by value)

} DataPartition;

//! Slice
//! Per-slice decoding state: active parameter sets, slice header values,
//! data partitions and CABAC contexts, reference list reordering data,
//! prediction/residual work buffers, scaling matrices, weighted prediction
//! tables and the function pointers used to decode the slice.
typedef struct slice
{
  struct img_par      *p_Vid;
  struct inp_par      *p_Inp;
  pic_parameter_set_rbsp_t *active_pps;
  seq_parameter_set_rbsp_t *active_sps;

  struct colocated_params *p_colocated;
  struct colocated_params *Co_located_JV[MAX_PLANE];  //!< p_colocated to be used during 4:4:4 independent mode decoding

  int                 mb_aff_frame_flag;
  int                 direct_spatial_mv_pred_flag;       //!< Indicator for direct mode type (1 for Spatial, 0 for Temporal)
  int                 num_ref_idx_l0_active;             //!< number of available list 0 references
  int                 num_ref_idx_l1_active;             //!< number of available list 1 references

  int                 qp;
  int                 slice_qp_delta;
  int                 qs;
  int                 slice_qs_delta;
  int                 slice_type;    //!< slice type
  int                 model_number;  //!< cabac model number
  PictureStructure    structure;     //!< Identify picture structure type
  int                 start_mb_nr;   //!< MUST be set by NAL even in case of ei_flag == 1
  int                 max_part_nr;
  int                 dp_mode;       //!< data partitioning mode
  int                 last_dquant;

  //  int                 last_mb_nr;    //!< only valid when entropy coding == CABAC
  DataPartition       *partArr;      //!< array of partitions
  MotionInfoContexts  *mot_ctx;      //!< pointer to struct of context models for use in CABAC
  TextureInfoContexts *tex_ctx;      //!< pointer to struct of context models for use in CABAC

  int mvscale[6][MAX_REFERENCE_PICTURES];

  // reference picture list reordering data, one set per list (l0 / l1)
  int                 ref_pic_list_reordering_flag_l0;
  int                 *reordering_of_pic_nums_idc_l0;
  int                 *abs_diff_pic_num_minus1_l0;
  int                 *long_term_pic_idx_l0;
  int                 ref_pic_list_reordering_flag_l1;
  int                 *reordering_of_pic_nums_idc_l1;
  int                 *abs_diff_pic_num_minus1_l1;
  int                 *long_term_pic_idx_l1;
  

  short               DFDisableIdc;     //!< Disable deblocking filter on slice
  short               DFAlphaC0Offset;  //!< Alpha and C0 offset for filtering slice
  short               DFBetaOffset;     //!< Beta offset for filtering slice

  int                 pic_parameter_set_id;   //!<the ID of the picture parameter set the slice is referring to

  int                 dpB_NotPresent;    //!< non-zero, if data partition B is lost
  int                 dpC_NotPresent;    //!< non-zero, if data partition C is lost


	// 32-byte aligned work buffers (MSVC __declspec).  The anonymous union
	// overlays four differently shaped views of the same 16-bit storage.
	__declspec(align(32)) h264_imgpel_macroblock_t mb_pred[MAX_PLANE];
	__declspec(align(32)) h264_imgpel_macroblock_t mb_rec[MAX_PLANE];
	__declspec(align(32)) union
	{
		__declspec(align(32)) h264_short_8x8block_t mb_rres8[MAX_PLANE][4];
		__declspec(align(32)) h264_short_macroblock_t cof[MAX_PLANE];
		__declspec(align(32)) h264_short_block_t cof4[MAX_PLANE][16]; // TODO: get this to work, one of these days
		__declspec(align(32)) h264_short_macroblock_t ipcm[MAX_PLANE];
	};

  int cofu[16];

  // Scaling matrix info
  int  InvLevelScale4x4_Intra[3][6][4][4];
  int  InvLevelScale4x4_Inter[3][6][4][4];
  int  InvLevelScale8x8_Intra[3][6][64];
  int  InvLevelScale8x8_Inter[3][6][64];

  int  *qmatrix[12];

  // Cabac
	// TODO: we could optimize coefficient reading by storing the levels/runs instead of coefficients
  // NOTE(review): the remark says "one more for EOB" but the array holds exactly 64 entries -- confirm
  int16_t  coeff[64]; // one more for EOB
  int  coeff_ctr;
  int  pos;

  //weighted prediction
  unsigned int apply_weights;
  unsigned int luma_log2_weight_denom;
  unsigned int chroma_log2_weight_denom;
  int wp_weight[2][MAX_REFERENCE_PICTURES][3];  // weight in [list][index][component] order
  int wp_offset[6][MAX_REFERENCE_PICTURES][3];  // offset in [list][index][component] order
  int wbp_weight[6][MAX_REFERENCE_PICTURES][MAX_REFERENCE_PICTURES][3]; //weight in [list][fw_index][bw_index][component] order
  int wp_round_luma;
  int wp_round_chroma;

  // function pointers; the code that assigns them is not visible in this header
  void (*read_CBP_and_coeffs_from_NAL) (Macroblock *currMB);
  int  (*decode_one_component     ) (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, struct storable_picture *dec_picture);
  int  (*readSlice                ) (struct img_par *, struct inp_par *);  
  int  (*nal_startcode_follows    ) (struct slice*, int );
  void (*read_motion_info_from_NAL) (Macroblock *currMB);
  void (*read_one_macroblock      ) (Macroblock *currMB);
  void (*interpret_mb_mode        ) (Macroblock *currMB);
	void (*compute_colocated        ) (struct slice *currSlice, struct colocated_params *p, struct storable_picture **listX[6]);

  void (*linfo_cbp_intra) (int len,int info,int *cbp, int *dummy);
  void (*linfo_cbp_inter) (int len,int info,int *cbp, int *dummy);
} Slice;

//****************************** ~DM ***********************************

// image parameters
typedef struct img_par
{
  struct inp_par      *p_Inp;
  pic_parameter_set_rbsp_t *active_pps;
  seq_parameter_set_rbsp_t *active_sps;
  seq_parameter_set_rbsp_t SeqParSet[MAXSPS];
  pic_parameter_set_rbsp_t PicParSet[MAXPPS];

  struct sei_params        *p_SEI;

  struct old_slice_par *old_slice;
  int number;                                 //!< frame number
  unsigned int current_mb_nr; // bitstream order
  unsigned int num_dec_mb;
  short        current_slice_nr;
  int *intra_block;
  
  int qp;                                     //!< quant for the current frame

  int sp_switch;                              //!< 1 for switching sp, 0 for normal sp  
  int type;                                   //!< image type INTER/INTRA
  int width;
  int height;
  int width_cr;                               //!< width chroma  
  int height_cr;                              //!< height chroma
  int mb_x;
  int mb_y;
  int block_x;
  int block_y;
  int pix_c_x;
  int pix_c_y;

  int allrefzero;

  byte **ipredmode;                  //!< prediction type [90][74]
  h264_nz_coefficient *nz_coeff;
  int **siblock;
  int cod_counter;                   //!< Current count of number of skipped macroblocks in a row

  int structure;                     //!< Identify picture structure type

  Slice      *currentSlice;          //!< pointer to current Slice data struct
  Macroblock *mb_data;               //!< array containing all MBs of a whole frame
  Macroblock *mb_data_JV[MAX_PLANE]; //!< mb_data to be used for 4:4:4 independent mode
  int colour_plane_id;               //!< colour_plane_id of the current coded slice
  int ChromaArrayType;

  // For MB level frame/field coding
  int mb_aff_frame_flag;

  // for signalling to the neighbour logic that this is a deblocker call
  int DeblockCall;
  byte mixedModeEdgeFlag;

  // picture error concealment
  // concealment_head points to first node in list, concealment_end points to
  // last node in list. Initialize both to NULL, meaning no nodes in list yet
  struct concealment_node *concealment_head;
  struct concealment_node *concealment_end;

  DecRefPicMarking_t *dec_ref_pic_marking_buffer;                    //!< stores the memory management control operations

  int num_ref_idx_l0_active;             //!< number of forward reference
  int num_ref_idx_l1_active;             //!< number of backward reference

  int slice_group_change_cycle;

  int redundant_pic_cnt;

  unsigned int pre_frame_num;           //!< store the frame_num in the last decoded slice. For detecting gap in frame_num.
  int non_conforming_stream;

  // End JVT-D101
  // POC200301: from unsigned int to int
  int toppoc;      //poc for this top field // POC200301
  int bottompoc;   //poc of bottom field of frame
  int framepoc;    //poc of this frame // POC200301
  unsigned int frame_num;   //frame_num for this frame
  unsigned int field_pic_flag;
  byte         bottom_field_flag;

  //the following is for slice header syntax elements of poc
  // for poc mode 0.
  unsigned int pic_order_cnt_lsb;
  int delta_pic_order_cnt_bottom;
  // for poc mode 1.
  int delta_pic_order_cnt[3];

  // ////////////////////////
  // for POC mode 0:
  signed   int PrevPicOrderCntMsb;
  unsigned int PrevPicOrderCntLsb;
  signed   int PicOrderCntMsb;

  // for POC mode 1:
  unsigned int AbsFrameNum;
  signed int ExpectedPicOrderCnt, PicOrderCntCycleCnt, FrameNumInPicOrderCntCycle;
  unsigned int PreviousFrameNum, FrameNumOffset;
  int ExpectedDeltaPerPicOrderCntCycle;
  int PreviousPOC, ThisPOC;
  int PreviousFrameNumOffset;
  // /////////////////////////

  int idr_flag;
  int nal_reference_idc;                       //!< nal_reference_idc from NAL unit

  int idr_pic_id;

  int MaxFrameNum;

  unsigned int PicWidthInMbs;
  unsigned int PicHeightInMapUnits;
  unsigned int FrameHeightInMbs;
  unsigned int PicHeightInMbs;
  unsigned int PicSizeInMbs;
  unsigned int FrameSizeInMbs;
  unsigned int oldFrameSizeInMbs;

  int no_output_of_prior_pics_flag;
  int long_term_reference_flag;
  int adaptive_ref_pic_buffering_flag;

  int last_has_mmco_5;
  int last_pic_bottom_field;

  // Fidelity Range Extensions Stuff
  short bitdepth_luma;
  short bitdepth_chroma;
  int bitdepth_scale[2];
  int bitdepth_luma_qp_scale;
  int bitdepth_chroma_qp_scale;
  unsigned int dc_pred_value_comp[MAX_PLANE]; //!< component value for DC prediction (depends on component pel bit depth)
  int max_pel_value_comp[MAX_PLANE];       //!< max value that one picture element (pixel) can take (depends on pic_unit_bitdepth)
  int Transform8x8Mode;
  int profile_idc;
  int yuv_format;
  int lossless_qpprime_flag;
  int num_blk8x8_uv;
  int num_uv_blocks;
  int num_cdc_coeff;
  int mb_cr_size_x;
  int mb_cr_size_y;
  int mb_cr_size_x_blk;
  int mb_cr_size_y_blk;
  int mb_size[3][2];                         //!< component macroblock dimensions
  int mb_size_blk[3][2];                     //!< component macroblock dimensions 
  int mb_size_shift[3][2];
  int subpel_x;
  int subpel_y;
  int shiftpel_x;
  int shiftpel_y;

  int max_vmv_r;                             //!< maximum vertical motion vector range in luma quarter frame pixel units for the current level_idc
  int max_mb_vmv_r;                          //!< maximum vertical motion vector range in luma quarter pixel units for the current level_idc

  // picture error concealment
  int last_ref_pic_poc;
  int ref_poc_gap;
  int poc_gap;
  int earlier_missing_poc;
  unsigned int frame_to_conceal;
  int IDR_concealment_flag;
  int conceal_slice_type;

  // random access point decoding
  int recovery_point;
  int recovery_point_found;
  int recovery_frame_cnt;
  int recovery_frame_num;
  int recovery_poc;

  int separate_colour_plane_flag;

  int frame_number;
  int init_bl_done;

  // Redundant slices. Should be moved to another structure and allocated only if extended profile
  unsigned int previous_frame_num; //!< frame number of previous slice
  int ref_flag[17];                //!< 0: i-th previous frame is incorrect
  //!< non-zero: i-th previous frame is correct
  int Is_primary_correct;          //!< if primary frame is correct, 0: incorrect
  int Is_redundant_correct;        //!< if redundant frame is correct, 0:incorrect
  int redundant_slice_ref_idx;     //!< reference index of redundant slice

  //FILE *p_log;                     //!< SNR file
  int LastAccessUnitExists;
  int NALUCount;

  Boolean global_init_done;

  int *qp_per_matrix;
  int *qp_rem_matrix;

  struct frame_store *last_out_fs;
  int pocs_in_dpb[100];


  struct storable_picture *dec_picture;
  struct storable_picture *dec_picture_JV[MAX_PLANE];  //!< dec_picture to be used during 4:4:4 independent mode decoding
  struct storable_picture *no_reference_picture; //!< dummy storable picture for recovery point
  struct storable_picture **listX[6];

  // Error parameters
  struct object_buffer  *erc_object_list;
  struct ercVariables_s *erc_errorVar;

  int erc_mvperMB;
  struct img_par *erc_img;
  int ec_flag[SE_MAX_ELEMENTS];        //!< array to set errorconcealment

	struct memory_input_struct *mem_input;

  struct frame_store *out_buffer;

  struct storable_picture *pending_output;
  int    pending_output_state;
  int    recovery_flag;

  // dpb
  struct decoded_picture_buffer *p_Dpb;

  char listXsize[6];
  // report
  char cslice_type[9];  
  // FMO
  int *MbToSliceGroupMap;
  int *MapUnitToSliceGroupMap;
  int  NumberOfSliceGroups;    // the number of slice groups -1 (0 == scan order, 7 == maximum)

#if (ENABLE_OUTPUT_TONEMAPPING)
  struct tone_mapping_struct_s *seiToneMapping;
#endif

	// benski> buffer of storable pictures ready for output.
	// might be able to optimize a tad by making a ringbuffer, but i doubt it matters
	struct storable_picture **out_pictures;
	size_t size_out_pictures;
	size_t num_out_pictures;

	ImageCache image_cache[2]; // [0] is luma [1] is chroma (shared for both planes)
	MotionCache motion_cache;

	h264_pic_position *PicPos; //! Helper array to access macroblock positions. 

	NALU_t *nalu; // a cache so we don't re-alloc every time

  void (*getNeighbour)        (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
	void (*getNeighbourPX_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
	void (*getNeighbourXP_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
	void (*getNeighbourLuma)    (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourPXLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourXPLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourLeftLuma)(const Macroblock *currMB, PixelPos *pix);
	void (*getNeighbourNXLuma)  (const Macroblock *currMB, int yN, PixelPos *pix); // xN<0, yN full range
	void (*getNeighbourLeft)    (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN<0, yN=0
	void (*getNeighbourUp)      (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN=0, yN<0
	void (*getNeighbourNX)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN full range
	void (*getNeighbourNP)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
	void (*getNeighbourNPChromaNB)(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
	void (*getNeighbour0X)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN=0, yN full range
	void (*getNeighbour0XLuma)      (const Macroblock *currMB, int yN, PixelPos *pix); // xN=0, yN full range
	void (*getNeighbourX0)      (const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix); // xN full range, yN = 0
	void (*getNeighbourUpLuma)  (const Macroblock *currMB, PixelPos *pix); // xN=0, yN<0
	void (*getNeighbourNPLumaNB)(const Macroblock *currMB, int yN, PixelPos *pix);
	void (*getNeighbourPXLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourPXLumaNB_NoPos)(const Macroblock *currMB, int yN, PixelPos *pix);
	void (*getNeighbourPPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourXPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
	void (*getNeighbourXPLumaNB_NoPos)(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
  void (*get_mb_block_pos) (const h264_pic_position *PicPos, int mb_addr, short *x, short *y);
  void (*GetStrength)      (byte Strength[16], Macroblock *MbQ, int dir,int edge, int mvlimit, struct storable_picture *p);
  void (*EdgeLoopLuma)     (ColorPlane pl, struct video_image *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, struct storable_picture *p);
  void (*EdgeLoopChroma)   (struct video_image *image, byte Strength[16], Macroblock *MbQ, int dir, int edge, int uv, struct storable_picture *p);
} VideoParameters;

/*!
 * \brief Decoder input parameters (historically: "input parameters from
 *        configuration file").
 *
 * Groups the deblocking control, the output frame format, the optional
 * leaky-bucket settings (compiled in only when _LEAKYBUCKET_ is defined)
 * and two POC-gap values used by picture error concealment.
 */
typedef struct inp_par
{
  int intra_profile_deblocking;               //!< Loop filter usage determined by flags and parameters in bitstream 

  // Output sequence format related variables
  FrameFormat output;                   //!< output related information

#ifdef _LEAKYBUCKET_
  // These members exist only in builds that define _LEAKYBUCKET_, so the
  // struct layout differs between such builds and builds without it.
  unsigned long R_decoder;                //!< Decoder Rate in HRD Model
  unsigned long B_decoder;                //!< Decoder Buffer size in HRD model
  unsigned long F_decoder;                //!< Decoder Initial buffer fullness in HRD model
  char LeakyBucketParamFile[FILE_NAME_SIZE];         //!< LeakyBucketParamFile
#endif

  // picture error concealment
  // NOTE(review): units and exact use of the two gap values are not visible
  // in this header -- presumably picture-order-count distances; confirm
  // against the concealment code.
  int ref_poc_gap;
  int poc_gap;
} InputParameters;

/*!
 * \brief Record of slice-level values, one member per stored field.
 *
 * NOTE(review): going by the name, this appears to hold the values of the
 * previously seen slice so they can be compared with the current one; the
 * comparison itself is not visible in this header -- confirm against the
 * users of OldSliceParams.
 *
 * The members pic_oder_cnt_lsb and delta_pic_oder_cnt_bottom spell "order"
 * as "oder". The names are left untouched because renaming them would
 * require changing every user of this struct.
 */
typedef struct old_slice_par
{
   unsigned field_pic_flag;   
   unsigned frame_num;
   int      nal_ref_idc;
   unsigned pic_oder_cnt_lsb;            // sic: "oder" (see note above)
   int      delta_pic_oder_cnt_bottom;   // sic: "oder" (see note above)
   int      delta_pic_order_cnt[2];
   byte     bottom_field_flag;
   byte     idr_flag;
   int      idr_pic_id;
   int      pps_id;   
} OldSliceParams;

/*!
 * \brief Bundle of the two top-level decoder state pointers.
 *
 * Holds only pointers; who allocates and frees the pointed-to objects is
 * not visible in this header.
 */
typedef struct decoder_params
{
  InputParameters   *p_Inp;          //!< Input Parameters
  VideoParameters   *p_Vid;          //!< Image Parameters
  
} DecoderParams;

// Trace globals: declared only when TRACE is defined; the definitions live
// in another translation unit. FILE comes from <stdio.h>, so that header
// must be visible before this point in TRACE builds.
#ifdef TRACE
extern FILE *p_trace; //!< Trace file
extern int bitcounter; //!< NOTE(review): presumably a running count of traced bits -- confirm against the definition
#endif

// prototypes

//! Error reporting entry point: takes a message string and an integer code.
//! NOTE(review): whether this call returns or terminates the process is not
//! visible from this header -- confirm against the definition.
extern void error(char *text, int code);

// dynamic mem allocation
//! Set-up / tear-down pair for the global buffers tied to a VideoParameters
//! instance.
//! NOTE(review): the meaning of init_global_buffers' int return value is not
//! visible here -- check the definition before relying on it.
extern int  init_global_buffers(VideoParameters *p_Vid);
extern void free_global_buffers(VideoParameters *p_Vid);

//! Payload conversion helpers that take a byte buffer and a byte position.
//! In H.264 terminology EBSP = encapsulated byte sequence payload,
//! RBSP = raw byte sequence payload, SODB = string of data bits.
//! NOTE(review): presumably both convert streamBuffer in place and return
//! the resulting position/length -- confirm against the definitions.
extern int RBSPtoSODB(byte *streamBuffer, int last_byte_pos);
extern int EBSPtoRBSP(byte *streamBuffer, int end_bytepos);

//! Allocation / release pair for DataPartition objects.
//! NOTE(review): n is presumably the number of partitions in the array
//! returned by AllocPartition and must match on release -- confirm.
void FreePartition (DataPartition *dp, int n);
DataPartition *AllocPartition(int n);

//! Trace output helpers. Note that these prototypes are declared
//! unconditionally, i.e. also when TRACE is not defined.
//! NOTE(review): presumably len is a bit length and info/value1 are the
//! values written to the trace -- confirm against the definitions.
void tracebits(const char *trace_str,  int len,  int info,int value1);
void tracebits2(const char *trace_str, int len, int info);

//! Integer logarithm helpers on an unsigned value.
//! NOTE(review): going by the name, CeilLog2 presumably returns
//! ceil(log2(uiVal)); how the _sf variant differs is not visible in this
//! header -- check the definitions before choosing between them.
unsigned CeilLog2   ( unsigned uiVal);
unsigned CeilLog2_sf( unsigned uiVal);

// For 4:4:4 independent mode
//! Helpers operating on a VideoParameters instance for that mode.
//! NOTE(review): nplane is presumably the index of the colour plane to
//! switch to -- confirm the valid range against the definition.
extern void change_plane_JV( VideoParameters *p_Vid, int nplane );
extern void make_frame_picture_JV(VideoParameters *p_Vid);


#endif