Diffstat (limited to 'runtime/arith.c')
-rw-r--r-- | runtime/arith.c | 440 |
1 file changed, 335 insertions, 105 deletions
diff --git a/runtime/arith.c b/runtime/arith.c
index 97121ae8..60576090 100644
--- a/runtime/arith.c
+++ b/runtime/arith.c
@@ -20,7 +20,7 @@
 /* 64-bit division for 64-bit cpus and i386 */
 /* Other 32-bit cpus will need to modify this file. */
 
-#ifdef __i386__
+#if defined (__i386__) || defined(__arm__)
 long long _div64 (long long u, long long v);
 long long _mod64 (long long u, long long v);
 #endif
@@ -46,7 +46,7 @@ int64_t _stp_div64 (const char **error, int64_t x, int64_t y)
     if (unlikely (y == -1))
         return -x;
-#ifdef __LP64__
+#if defined (__LP64__)
     return x/y;
 #else
     if (likely ((x >= LONG_MIN && x <= LONG_MAX) &&
@@ -72,7 +72,7 @@ int64_t _stp_mod64 (const char **error, int64_t x, int64_t y)
     if (unlikely (y == 1 || y == -1))
         return 0;
-#ifdef __LP64__
+#if defined (__LP64__)
     return x%y;
 #else
     if (likely ((x >= LONG_MIN && x <= LONG_MAX) &&
@@ -106,14 +106,15 @@ int _stp_random_pm (int n)
 #endif /* _STP_TEST_ */
 
 
+#if defined (__i386__) || defined (__arm__)
 
-#ifdef __i386__
 /* 64-bit division functions extracted from libgcc */
 typedef long long DWtype;
 typedef unsigned long long UDWtype;
 typedef unsigned long UWtype;
 typedef long Wtype;
 typedef unsigned int USItype;
+typedef unsigned int UQItype __attribute__ ((mode (QI)));
 
 #ifdef _BIG_ENDIAN
 struct DWstruct {Wtype high, low;};
@@ -121,8 +122,15 @@ struct DWstruct {Wtype high, low;};
 struct DWstruct {Wtype low, high;};
 #endif
 
+#define __CLOBBER_CC : "cc"
+
 #define W_TYPE_SIZE 32
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
 
 typedef union
 {
   struct DWstruct s;
@@ -130,6 +138,7 @@ typedef union
 } DWunion;
 
+#if defined (__i386__)
 
 /* these are the i386 versions of these macros from gcc/longlong.h */
 
 #define umul_ppmm(w1, w0, u, v) \
@@ -164,111 +173,330 @@ typedef union
     (count) = __cbtmp ^ 31; \
   } while (0)
 
-inline UDWtype _stp_udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
+#elif defined (__arm__)
+
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subs %1, %4, %5\n\tsbc %0, %2, %3" \
+           : "=r" ((USItype) (sh)), \
+             "=&r" ((USItype) (sl)) \
+           : "r" ((USItype) (ah)), \
+             "rI" ((USItype) (bh)), \
+             "r" ((USItype) (al)), \
+             "rI" ((USItype) (bl)) __CLOBBER_CC)
+#define umul_ppmm(xh, xl, a, b) \
+{register USItype __t0, __t1, __t2; \
+  __asm__ ("%@ Inlined umul_ppmm\n" \
+           " mov %2, %5, lsr #16\n" \
+           " mov %0, %6, lsr #16\n" \
+           " bic %3, %5, %2, lsl #16\n" \
+           " bic %4, %6, %0, lsl #16\n" \
+           " mul %1, %3, %4\n" \
+           " mul %4, %2, %4\n" \
+           " mul %3, %0, %3\n" \
+           " mul %0, %2, %0\n" \
+           " adds %3, %4, %3\n" \
+           " addcs %0, %0, #65536\n" \
+           " adds %1, %1, %3, lsl #16\n" \
+           " adc %0, %0, %3, lsr #16" \
+           : "=&r" ((USItype) (xh)), \
+             "=r" ((USItype) (xl)), \
+             "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
+           : "r" ((USItype) (a)), \
+             "r" ((USItype) (b)) __CLOBBER_CC );}
+
+#endif
+
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+  do { \
+    UWtype __d1, __d0, __q1, __q0; \
+    UWtype __r1, __r0, __m; \
+    __d1 = __ll_highpart (d); \
+    __d0 = __ll_lowpart (d); \
+    \
+    __r1 = (n1) % __d1; \
+    __q1 = (n1) / __d1; \
+    __m = (UWtype) __q1 * __d0; \
+    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
+    if (__r1 < __m) \
+      { \
+        __q1--, __r1 += (d); \
+        if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
+          if (__r1 < __m) \
+            __q1--, __r1 += (d); \
+      } \
+    __r1 -= __m; \
+    \
+    __r0 = __r1 % __d1; \
+    __q0 = __r1 / __d1; \
+    __m = (UWtype) __q0 * __d0; \
+    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
+    if (__r0 < __m) \
+      { \
+        __q0--, __r0 += (d); \
+        if (__r0 >= (d)) \
+          if (__r0 < __m) \
+            __q0--, __r0 += (d); \
+      } \
+    __r0 -= __m; \
+    \
+    (q) = (UWtype) __q1 * __ll_B | __q0; \
+    (r) = __r0; \
+  } while (0)
+
+#if !defined (udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#else
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
+
+#if !defined (count_leading_zeros)
+const UQItype _stp_clz_tab[256] =
 {
-  const DWunion nn = {.ll = n};
-  const DWunion dd = {.ll = d};
-  DWunion ww,rr;
-  UWtype d0, d1, n0, n1, n2;
-  UWtype q0, q1;
-  UWtype b, bm;
-
-  d0 = dd.s.low;
-  d1 = dd.s.high;
-  n0 = nn.s.low;
-  n1 = nn.s.high;
-
-  if (d1 == 0) {
-    if (d0 > n1) {
-      /* 0q = nn / 0D */
-      udiv_qrnnd (q0, n0, n1, n0, d0);
-      q1 = 0;
-      /* Remainder in n0. */
-    } else {
-      /* qq = NN / 0d */
-      if (d0 == 0)
-        d0 = 1 / d0; /* Divide intentionally by zero. */
-      udiv_qrnnd (q1, n1, 0, n1, d0);
-      udiv_qrnnd (q0, n0, n1, n0, d0);
-      /* Remainder in n0. */
+  0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+};
+
+#define count_leading_zeros(count, x) \
+  do { \
+    UWtype __xr = (x); \
+    UWtype __a; \
+    \
+    if (W_TYPE_SIZE <= 32) \
+      { \
+        __a = __xr < ((UWtype)1<<2*__BITS4) \
+          ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4) \
+          : (__xr < ((UWtype)1<<3*__BITS4) ? 2*__BITS4 : 3*__BITS4); \
+      } \
+    else \
+      { \
+        for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
+          if (((__xr >> __a) & 0xff) != 0) \
+            break; \
+      } \
+    \
+    (count) = W_TYPE_SIZE - (_stp_clz_tab[__xr >> __a] + __a); \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
+#endif
+
+UDWtype
+_stp_udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
+{
+  const DWunion nn = {.ll = n};
+  const DWunion dd = {.ll = d};
+  DWunion ww, rr;
+  UWtype d0, d1, n0, n1, n2;
+  UWtype q0, q1;
+  UWtype b, bm;
+
+  d0 = dd.s.low;
+  d1 = dd.s.high;
+  n0 = nn.s.low;
+  n1 = nn.s.high;
+
+#if !UDIV_NEEDS_NORMALIZATION
+  if (d1 == 0)
+    {
+      if (d0 > n1)
+        {
+          /* 0q = nn / 0D */
+
+          udiv_qrnnd (q0, n0, n1, n0, d0);
+          q1 = 0;
+
+          /* Remainder in n0. */
+        }
+      else
+        {
+          /* qq = NN / 0d */
+
+          if (d0 == 0)
+            d0 = 1 / d0; /* Divide intentionally by zero. */
+
+          udiv_qrnnd (q1, n1, 0, n1, d0);
+          udiv_qrnnd (q0, n0, n1, n0, d0);
+
+          /* Remainder in n0. */
+        }
+
+      if (rp != 0)
+        {
+          rr.s.low = n0;
+          rr.s.high = 0;
+          *rp = rr.ll;
+        }
+    }
+
+#else /* UDIV_NEEDS_NORMALIZATION */
+
+  if (d1 == 0)
+    {
+      if (d0 > n1)
+        {
+          /* 0q = nn / 0D */
+
+          count_leading_zeros (bm, d0);
+
+          if (bm != 0)
+            {
+              /* Normalize, i.e. make the most significant bit of the
+                 denominator set. */
+
+              d0 = d0 << bm;
+              n1 = (n1 << bm) | (n0 >> (W_TYPE_SIZE - bm));
+              n0 = n0 << bm;
+            }
+
+          udiv_qrnnd (q0, n0, n1, n0, d0);
+          q1 = 0;
+
+          /* Remainder in n0 >> bm. */
+        }
+      else
+        {
+          /* qq = NN / 0d */
+
+          if (d0 == 0)
+            d0 = 1 / d0; /* Divide intentionally by zero. */
+
+          count_leading_zeros (bm, d0);
+
+          if (bm == 0)
+            {
+              /* From (n1 >= d0) /\ (the most significant bit of d0 is set),
+                 conclude (the most significant bit of n1 is set) /\ (the
+                 leading quotient digit q1 = 1).
+
+                 This special case is necessary, not an optimization.
+                 (Shifts counts of W_TYPE_SIZE are undefined.) */
+
+              n1 -= d0;
+              q1 = 1;
+            }
+          else
+            {
+              /* Normalize. */
+
+              b = W_TYPE_SIZE - bm;
+
+              d0 = d0 << bm;
+              n2 = n1 >> b;
+              n1 = (n1 << bm) | (n0 >> b);
+              n0 = n0 << bm;
+
+              udiv_qrnnd (q1, n1, n2, n1, d0);
+            }
+
+          /* n1 != d0... */
+
+          udiv_qrnnd (q0, n0, n1, n0, d0);
+
+          /* Remainder in n0 >> bm. */
+        }
+
+      if (rp != 0)
+        {
+          rr.s.low = n0 >> bm;
+          rr.s.high = 0;
+          *rp = rr.ll;
+        }
+    }
+#endif /* UDIV_NEEDS_NORMALIZATION */
+
+  else
+    {
+      if (d1 > n1)
+        {
+          /* 00 = nn / DD */
+
+          q0 = 0;
+          q1 = 0;
+
+          /* Remainder in n1n0. */
+          if (rp != 0)
+            {
+              rr.s.low = n0;
+              rr.s.high = n1;
+              *rp = rr.ll;
+            }
+        }
+      else
+        {
+          /* 0q = NN / dd */
+
+          count_leading_zeros (bm, d1);
+          if (bm == 0)
+            {
+              /* From (n1 >= d1) /\ (the most significant bit of d1 is set),
+                 conclude (the most significant bit of n1 is set) /\ (the
+                 quotient digit q0 = 0 or 1).
+
+                 This special case is necessary, not an optimization. */
+
+              /* The condition on the next line takes advantage of that
+                 n1 >= d1 (true due to program flow). */
+              if (n1 > d1 || n0 >= d0)
+                {
+                  q0 = 1;
+                  sub_ddmmss (n1, n0, n1, n0, d1, d0);
                 }
-
-    if (rp != 0) {
-      rr.s.low = n0;
-      rr.s.high = 0;
-      *rp = rr.ll;
+              else
+                q0 = 0;
+
+              q1 = 0;
+
+              if (rp != 0)
+                {
+                  rr.s.low = n0;
+                  rr.s.high = n1;
+                  *rp = rr.ll;
                 }
-  } else {
-    if (d1 > n1) {
-      /* 00 = nn / DD */
-      q0 = 0;
-      q1 = 0;
-
-      /* Remainder in n1n0. */
-      if (rp != 0) {
-        rr.s.low = n0;
-        rr.s.high = n1;
-        *rp = rr.ll;
-      }
-    } else {
-      /* 0q = NN / dd */
-      count_leading_zeros (bm, d1);
-      if (bm == 0) {
-        /* From (n1 >= d1) /\ (the most significant bit of d1 is set),
-           conclude (the most significant bit of n1 is set) /\ (the
-           quotient digit q0 = 0 or 1).
-           This special case is necessary, not an optimization. */
-
-        /* The condition on the next line takes advantage of that
-           n1 >= d1 (true due to program flow). */
-        if (n1 > d1 || n0 >= d0) {
-          q0 = 1;
-          sub_ddmmss (n1, n0, n1, n0, d1, d0);
-        } else
-          q0 = 0;
-
-        q1 = 0;
-
-        if (rp != 0) {
-          rr.s.low = n0;
-          rr.s.high = n1;
-          *rp = rr.ll;
-        }
-      } else {
-        UWtype m1, m0;
-        /* Normalize. */
-
-        b = W_TYPE_SIZE - bm;
-
-        d1 = (d1 << bm) | (d0 >> b);
-        d0 = d0 << bm;
-        n2 = n1 >> b;
-        n1 = (n1 << bm) | (n0 >> b);
-        n0 = n0 << bm;
-
-        udiv_qrnnd (q0, n1, n2, n1, d1);
-        umul_ppmm (m1, m0, q0, d0);
-
-        if (m1 > n1 || (m1 == n1 && m0 > n0)) {
-          q0--;
-          sub_ddmmss (m1, m0, m1, m0, d1, d0);
-        }
-
-        q1 = 0;
-
-        /* Remainder in (n1n0 - m1m0) >> bm. */
-        if (rp != 0) {
-          sub_ddmmss (n1, n0, n1, n0, m1, m0);
-          rr.s.low = (n1 << b) | (n0 >> bm);
-          rr.s.high = n1 >> bm;
-          *rp = rr.ll;
-        }
-      }
+            }
+          else
+            {
+              UWtype m1, m0;
+              /* Normalize. */
+
+              b = W_TYPE_SIZE - bm;
+
+              d1 = (d1 << bm) | (d0 >> b);
+              d0 = d0 << bm;
+              n2 = n1 >> b;
+              n1 = (n1 << bm) | (n0 >> b);
+              n0 = n0 << bm;
+
+              udiv_qrnnd (q0, n1, n2, n1, d1);
+              umul_ppmm (m1, m0, q0, d0);
+
+              if (m1 > n1 || (m1 == n1 && m0 > n0))
+                {
+                  q0--;
+                  sub_ddmmss (m1, m0, m1, m0, d1, d0);
+                }
+
+              q1 = 0;
+
+              /* Remainder in (n1n0 - m1m0) >> bm. */
+              if (rp != 0)
+                {
+                  sub_ddmmss (n1, n0, n1, n0, m1, m0);
+                  rr.s.low = (n1 << b) | (n0 >> bm);
+                  rr.s.high = n1 >> bm;
+                  *rp = rr.ll;
                 }
+            }
         }
-
-  ww.s.low = q0; ww.s.high = q1;
-  return ww.ll;
+    }
+
+  ww.s.low = q0; ww.s.high = q1;
+  return ww.ll;
 }
 
 long long _div64 (long long u, long long v)
@@ -311,5 +539,7 @@ long long _mod64 (long long u, long long v)
     return w;
 }
 
-#endif /* __i386__ */
+
+#endif /* __i386__ || __arm__ */
+
 #endif /* _ARITH_C_ */
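Side note (not part of the commit): the signed wrappers _div64/_mod64, whose full bodies lie outside the hunks shown above, follow the usual libgcc pattern around the unsigned helper — reduce the operands to magnitudes, call _stp_udivmoddi4 (quotient returned, remainder stored through rp when it is non-NULL), then restore the sign. A minimal host-side sketch of that reduction; udivmod64_stub is a hypothetical stand-in for _stp_udivmoddi4 so the example compiles and runs on any host:

/* Sketch of the libgcc-style signed-divide reduction assumed above. */
#include <stdint.h>
#include <stdio.h>

/* Stand-in for _stp_udivmoddi4: quotient is the return value, the
   remainder is handed back through rp, as in runtime/arith.c. */
static uint64_t udivmod64_stub (uint64_t n, uint64_t d, uint64_t *rp)
{
  if (rp)
    *rp = n % d;
  return n / d;
}

static int64_t div64_sketch (int64_t u, int64_t v)
{
  int negate = 0;
  uint64_t un = (uint64_t) u, vn = (uint64_t) v;

  if (u < 0) { un = 0 - un; negate = !negate; }   /* work on magnitudes */
  if (v < 0) { vn = 0 - vn; negate = !negate; }

  uint64_t q = udivmod64_stub (un, vn, 0);        /* unsigned divide */
  return negate ? -(int64_t) q : (int64_t) q;     /* restore the sign */
}

int main (void)
{
  /* -1000000007 / 13 truncates toward zero: prints -76923077. */
  printf ("%lld\n", (long long) div64_sketch (-1000000007LL, 13));
  return 0;
}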