/* -*- linux-c -*- */
/* Math functions
 * Copyright (C) 2005 Red Hat Inc.
 * Portions (C) Free Software Foundation, Inc.
 *
 * This file is part of systemtap, and is free software.  You can
 * redistribute it and/or modify it under the terms of the GNU General
 * Public License (GPL); either version 2, or (at your option) any
 * later version.
 */

#ifndef _ARITH_C_ 
#define _ARITH_C_

/** @file arith.
 * @brief Implements various arithmetic-related helper functions
 */


/* 64-bit division for 64-bit cpus and i386 */
/* Other 32-bit cpus will need to modify this file. */

#if defined (__i386__) || defined(__arm__) || \
	(defined(__powerpc__) && !defined(__powerpc64__))
static long long _div64 (long long u, long long v);
static long long _mod64 (long long u, long long v);
#endif

/* 31 bit s390 suupport is not yet included, it may never be.
#ifdef __s390__
static long long _div64 (long long u, long long v);
static long long _mod64 (long long u, long long v);
#endif
*/

/** Divide x by y.  In case of division-by-zero,
 * set context error string, and return 0
 */
static int64_t _stp_div64 (const char **error, int64_t x, int64_t y)
{
	// check for division-by-zero
	if (unlikely (y == 0)) {
		if (error) *error = "division by 0";
		return 0;
	}

	if (unlikely (y == -1))
		return -x;

#if defined (__LP64__)
	return x/y;
#else
	if (likely ((x >= LONG_MIN && x <= LONG_MAX) &&
		    (y >= LONG_MIN && y <= LONG_MAX))) {
		return (long)x / (long)y;
	} else
		return _div64 (x, y);
#endif
}


/** Modulo x by y.  In case of division-by-zero,
 * set context error string, and return any 0
 */
static int64_t _stp_mod64 (const char **error, int64_t x, int64_t y)
{
	// check for division-by-zero
	if (unlikely (y == 0)) {
		if (error) *error = "division by 0";
		return 0;
	}
	
	if (unlikely (y == 1 || y == -1))
		return 0;
	
#if defined (__LP64__)
	return x%y;
#else
	if (likely ((x >= LONG_MIN && x <= LONG_MAX) &&
		    (y >= LONG_MIN && y <= LONG_MAX))) {
		return (long)x % (long)y;
	} else
		return _mod64 (x, y);
#endif
}


/** Return a random integer between 0 and n - 1.
 * @param n how far from zero to go.  Make it positive but less than a million or so.
 */
static unsigned long _stp_random_u (unsigned long n)
{
	static unsigned long seed;
	static int initialized_p = 0;

	if (unlikely (! initialized_p)) {
		seed = (unsigned long) jiffies;
		initialized_p = 1;
	}

	/* from glibc rand man page */
	seed = seed * 1103515245 + 12345;

	return (n == 0 ? 0 : seed % n);
}


/** Return a random integer between -n and n.
 * @param n how far from zero to go.  Make it positive but less than a million or so.
 */
static int _stp_random_pm (unsigned n)
{
        return -(int)n + (int)_stp_random_u (2*n + 1);
}


#if defined (__i386__) || defined (__arm__) || \
	(defined(__powerpc__) && !defined(__powerpc64__))

/* 64-bit division functions extracted from libgcc */
typedef long long DWtype;
typedef unsigned long long UDWtype;
typedef unsigned long UWtype;
typedef long Wtype;
typedef unsigned int USItype;
typedef unsigned int UQItype	__attribute__ ((mode (QI)));

#ifdef _BIG_ENDIAN
struct DWstruct {Wtype high, low;};
#else
struct DWstruct {Wtype low, high;};
#endif

#define __CLOBBER_CC : "cc"

#define W_TYPE_SIZE 32

#define __BITS4 (W_TYPE_SIZE / 4)
#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))

typedef union
{
	struct DWstruct s;
	DWtype ll;
} DWunion;


#if defined (__i386__)
/* these are the i386 versions of these macros from gcc/longlong.h */

#define umul_ppmm(w1, w0, u, v)			\
	__asm__ ("mull %3"			\
		 : "=a" ((USItype) (w0)),	\
		   "=d" ((USItype) (w1))	\
		 : "%0" ((USItype) (u)),	\
		   "rm" ((USItype) (v)))

#define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
	__asm__ ("subl %5,%1\n\tsbbl %3,%0"	\
		 : "=r" ((USItype) (sh)),	\
		   "=&r" ((USItype) (sl))	\
		 : "0" ((USItype) (ah)),	\
		   "g" ((USItype) (bh)),	\
		   "1" ((USItype) (al)),	\
		   "g" ((USItype) (bl)))

#define udiv_qrnnd(q, r, n1, n0, dv)		\
	__asm__ ("divl %4"			\
		 : "=a" ((USItype) (q)),	\
		   "=d" ((USItype) (r))		\
		 : "0" ((USItype) (n0)),	\
		   "1" ((USItype) (n1)),	\
		   "rm" ((USItype) (dv)))

#define count_leading_zeros(count, x)					\
	do {								\
		USItype __cbtmp;					\
		__asm__ ("bsrl %1,%0"					\
			 : "=r" (__cbtmp) : "rm" ((USItype) (x)));	\
		(count) = __cbtmp ^ 31;					\
	} while (0)

#elif defined (__powerpc__)
/* these are the ppc versions of these macros from gcc/longlong.h */

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)

#define count_leading_zeros(count, x) \
  __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32

#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)

#elif defined (__arm__)

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
#define umul_ppmm(xh, xl, a, b) \
{register USItype __t0, __t1, __t2;					\
  __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)),					\
	     "=r" ((USItype) (xl)),					\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)),					\
	     "r" ((USItype) (b)) __CLOBBER_CC );}

#endif

#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {									\
    UWtype __d1, __d0, __q1, __q0;					\
    UWtype __r1, __r0, __m;						\
    __d1 = __ll_highpart (d);						\
    __d0 = __ll_lowpart (d);						\
									\
    __r1 = (n1) % __d1;							\
    __q1 = (n1) / __d1;							\
    __m = (UWtype) __q1 * __d0;						\
    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
    if (__r1 < __m)							\
      {									\
	__q1--, __r1 += (d);						\
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
	  if (__r1 < __m)						\
	    __q1--, __r1 += (d);					\
      }									\
    __r1 -= __m;							\
									\
    __r0 = __r1 % __d1;							\
    __q0 = __r1 / __d1;							\
    __m = (UWtype) __q0 * __d0;						\
    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
    if (__r0 < __m)							\
      {									\
	__q0--, __r0 += (d);						\
	if (__r0 >= (d))						\
	  if (__r0 < __m)						\
	    __q0--, __r0 += (d);					\
      }									\
    __r0 -= __m;							\
									\
    (q) = (UWtype) __q1 * __ll_B | __q0;				\
    (r) = __r0;								\
  } while (0)

#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#else
#define UDIV_NEEDS_NORMALIZATION 0
#endif

#if !defined (count_leading_zeros)
static const UQItype _stp_clz_tab[256] =
{
  0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
};

#define count_leading_zeros(count, x) \
  do {									\
    UWtype __xr = (x);							\
    UWtype __a;								\
									\
    if (W_TYPE_SIZE <= 32)						\
      {									\
	__a = __xr < ((UWtype)1<<2*__BITS4)				\
	  ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4)			\
	  : (__xr < ((UWtype)1<<3*__BITS4) ?  2*__BITS4 : 3*__BITS4);	\
      }									\
    else								\
      {									\
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
	  if (((__xr >> __a) & 0xff) != 0)				\
	    break;							\
      }									\
									\
    (count) = W_TYPE_SIZE - (_stp_clz_tab[__xr >> __a] + __a);		\
  } while (0)
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
#endif

static UDWtype
_stp_udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
{
  const DWunion nn = {.ll = n};
  const DWunion dd = {.ll = d};
  DWunion ww, rr;
  UWtype d0, d1, n0, n1, n2;
  UWtype q0, q1;
  UWtype b, bm;

  d0 = dd.s.low;
  d1 = dd.s.high;
  n0 = nn.s.low;
  n1 = nn.s.high;

#if !UDIV_NEEDS_NORMALIZATION
  if (d1 == 0)
    {
      if (d0 > n1)
	{
	  /* 0q = nn / 0D */

	  udiv_qrnnd (q0, n0, n1, n0, d0);
	  q1 = 0;

	  /* Remainder in n0.  */
	}
      else
	{
	  /* qq = NN / 0d */

	  if (d0 == 0)
	    d0 = 1 / d0;	/* Divide intentionally by zero.  */

	  udiv_qrnnd (q1, n1, 0, n1, d0);
	  udiv_qrnnd (q0, n0, n1, n0, d0);

	  /* Remainder in n0.  */
	}

      if (rp != 0)
	{
	  rr.s.low = n0;
	  rr.s.high = 0;
	  *rp = rr.ll;
	}
    }

#else /* UDIV_NEEDS_NORMALIZATION */

  if (d1 == 0)
    {
      if (d0 > n1)
	{
	  /* 0q = nn / 0D */

	  count_leading_zeros (bm, d0);

	  if (bm != 0)
	    {
	      /* Normalize, i.e. make the most significant bit of the
		 denominator set.  */

	      d0 = d0 << bm;
	      n1 = (n1 << bm) | (n0 >> (W_TYPE_SIZE - bm));
	      n0 = n0 << bm;
	    }

	  udiv_qrnnd (q0, n0, n1, n0, d0);
	  q1 = 0;

	  /* Remainder in n0 >> bm.  */
	}
      else
	{
	  /* qq = NN / 0d */

	  if (d0 == 0)
	    d0 = 1 / d0;	/* Divide intentionally by zero.  */

	  count_leading_zeros (bm, d0);

	  if (bm == 0)
	    {
	      /* From (n1 >= d0) /\ (the most significant bit of d0 is set),
		 conclude (the most significant bit of n1 is set) /\ (the
		 leading quotient digit q1 = 1).

		 This special case is necessary, not an optimization.
		 (Shifts counts of W_TYPE_SIZE are undefined.)  */

	      n1 -= d0;
	      q1 = 1;
	    }
	  else
	    {
	      /* Normalize.  */

	      b = W_TYPE_SIZE - bm;

	      d0 = d0 << bm;
	      n2 = n1 >> b;
	      n1 = (n1 << bm) | (n0 >> b);
	      n0 = n0 << bm;

	      udiv_qrnnd (q1, n1, n2, n1, d0);
	    }

	  /* n1 != d0...  */

	  udiv_qrnnd (q0, n0, n1, n0, d0);

	  /* Remainder in n0 >> bm.  */
	}

      if (rp != 0)
	{
	  rr.s.low = n0 >> bm;
	  rr.s.high = 0;
	  *rp = rr.ll;
	}
    }
#endif /* UDIV_NEEDS_NORMALIZATION */

  else
    {
      if (d1 > n1)
	{
	  /* 00 = nn / DD */

	  q0 = 0;
	  q1 = 0;

	  /* Remainder in n1n0.  */
	  if (rp != 0)
	    {
	      rr.s.low = n0;
	      rr.s.high = n1;
	      *rp = rr.ll;
	    }
	}
      else
	{
	  /* 0q = NN / dd */

	  count_leading_zeros (bm, d1);
	  if (bm == 0)
	    {
	      /* From (n1 >= d1) /\ (the most significant bit of d1 is set),
		 conclude (the most significant bit of n1 is set) /\ (the
		 quotient digit q0 = 0 or 1).

		 This special case is necessary, not an optimization.  */

	      /* The condition on the next line takes advantage of that
		 n1 >= d1 (true due to program flow).  */
	      if (n1 > d1 || n0 >= d0)
		{
		  q0 = 1;
		  sub_ddmmss (n1, n0, n1, n0, d1, d0);
		}
	      else
		q0 = 0;

	      q1 = 0;

	      if (rp != 0)
		{
		  rr.s.low = n0;
		  rr.s.high = n1;
		  *rp = rr.ll;
		}
	    }
	  else
	    {
	      UWtype m1, m0;
	      /* Normalize.  */

	      b = W_TYPE_SIZE - bm;

	      d1 = (d1 << bm) | (d0 >> b);
	      d0 = d0 << bm;
	      n2 = n1 >> b;
	      n1 = (n1 << bm) | (n0 >> b);
	      n0 = n0 << bm;

	      udiv_qrnnd (q0, n1, n2, n1, d1);
	      umul_ppmm (m1, m0, q0, d0);

	      if (m1 > n1 || (m1 == n1 && m0 > n0))
		{
		  q0--;
		  sub_ddmmss (m1, m0, m1, m0, d1, d0);
		}

	      q1 = 0;

	      /* Remainder in (n1n0 - m1m0) >> bm.  */
	      if (rp != 0)
		{
		  sub_ddmmss (n1, n0, n1, n0, m1, m0);
		  rr.s.low = (n1 << b) | (n0 >> bm);
		  rr.s.high = n1 >> bm;
		  *rp = rr.ll;
		}
	    }
	}
    }

  ww.s.low = q0; ww.s.high = q1;
  return ww.ll;
}

static long long _div64 (long long u, long long v)
{
	long c = 0;
	DWunion uu = {.ll = u};
	DWunion vv = {.ll = v};
	DWtype w;
	
	if (uu.s.high < 0)
		c = ~c,
			uu.ll = -uu.ll;
	if (vv.s.high < 0)
		c = ~c,
			vv.ll = -vv.ll;
	
	w = _stp_udivmoddi4 (uu.ll, vv.ll, (UDWtype *) 0);
	if (c)
		w = -w;
	
	return w;
}

static long long _mod64 (long long u, long long v)
{
	long c = 0;
	DWunion uu = {.ll = u};
	DWunion vv = {.ll = v};
	DWtype w;
	
	if (uu.s.high < 0)
		c = ~c,
			uu.ll = -uu.ll;
	if (vv.s.high < 0)
		vv.ll = -vv.ll;
	
	(void) _stp_udivmoddi4 (uu.ll, vv.ll, (UDWtype*)&w);
	if (c)
		w = -w;
	
	return w;
}

#endif /* __i386__ || __arm__ || (__powerpc__ && !__powerpc64__) */

#endif /* _ARITH_C_ */