/*********************************************
 *
 * This code was written by Henrik Stewenius in 2006. You are free to 
 * use the code in research. For commercial or military applications 
 * please contact the Center for Visualization and Virtual 
 * Environments for licensing. 
 * The center for Visualization does have a rootsolver
 *
 * If you use this code in you reasearch please refer to: 
 * @INPROCEEDINGS{stewenius-engels-nister,
 *    author = {H. Stew\'enius and C. Engles and D. Nist\'er},
 *    title = {An Efficient Minimal Solution for Infinitesimal Camera Motion},
 *    booktitle = CVPR,
 *    year = 2007,
 *    month = JUN  
 * }
 *
 *
 * I do not claim that the code is fit for any particular purpose
 * and unless you input a polynomial solver it will not even compile. 
 *
 * Apart from the polynomial solver this is the code which gave 
 * the run-times claimed in the paper. For that we used a Sturm-Sequence
 * based solver. 
 *
 *
 * When it comes to inputting a polynomial root-solver, I will not help 
 * debugging the results but i will offer some advice: 
 *   1) If the answer is wrong. Try taking the coefficients in the 
 *        opposite order. There is no completely natural way in which a 
 *        polynomial should be represented. 
 *
 *   2) Remember that a degree 10 polynomial has 11 coefficients! 
 *
 *   3) __restrict__ is a keyword which works under gcc. It is possible 
 *        that it works under other compilers but it might have different
 *        names. Formally speaking i think restrict is valid in C but not 
 *        in C++. 
 *
 *   4) Defining HS_COMPILE_MEX sets up for compiling mex. A compile line 
 *      for this is probably inspired by:
 *        gcc -O3  -march=pentium4 -mfpmath=sse   -D HS_COMPILE_MEX -Wall -O3 -shared -I/usr/local/matlab/extern/include  fivepoint_infinitesimal_fast.c -o five_inf.mexglx
 *      For other compilers you are on your own. 
 *
 *
 *   5) I repeat: IF YOU DO NOT INCLUDE A ROOTSOLVER IT WILL NOT COMPILE!
 *
 *   6) If you licence this software from the center for visualization you 
 *        probably also want to licence the rootsolver in vv_poly.h. 
 *********************************************/

#error This THING DOES NOT COMPILE UNLESS YOU INCLUDE  A ROOT SOLVER! Substitute MY_SPECIAL_POLYSOLVER with your favourite rootsolver. 
// #include "something_providing_a_polynomial_solver.h"
#define FLOAT_T double 





#ifndef HS_COMPILE_MEX
void print_matrix(double *A, int h, int w){
}
#else
#include <stdio.h>
void print_matrix(double *A, int h, int w){
  int i,j; 
  printf("\n"); 
  for(i=0; i < h; i++){
    for(j=0; j < w; j++)
      printf("%f ", A[i*w+j]); 
    printf("\n"); 
  }
  printf("\n"); 
} 
#endif 




inline void convkernel_1_2(double *__restrict__ p,const double *__restrict__ p1,const double *__restrict__ p2){
  p[0] = p1[0]*p2[0];
  p[1] = p1[0]*p2[1]+p1[1]*p2[0];
  p[2] = p1[0]*p2[2]+p1[1]*p2[1];
  p[3] = p1[1]*p2[2];
}

inline void convkernel_1_2_sub_from(double *__restrict__ p,const double *__restrict__ p1,const double *__restrict__ p2){
  p[0] -= p1[0]*p2[0];
  p[1] -= p1[0]*p2[1]+p1[1]*p2[0];
  p[2] -= p1[0]*p2[2]+p1[1]*p2[1];
  p[3] -= p1[1]*p2[2];
}


inline void convkernel_3_4(double *__restrict__  p,const double *__restrict__ p1,const double *__restrict__ p2){
  p[0] = p1[0]*p2[0];
  p[1] = p1[0]*p2[1]+p1[1]*p2[0];
  p[2] = p1[0]*p2[2]+p1[1]*p2[1]+p1[2]*p2[0];
  p[3] = p1[0]*p2[3]+p1[1]*p2[2]+p1[2]*p2[1]+p1[3]*p2[0];
  p[4] = p1[0]*p2[4]+p1[1]*p2[3]+p1[2]*p2[2]+p1[3]*p2[1];
  p[5] = p1[1]*p2[4]+p1[2]*p2[3]+p1[3]*p2[2];
  p[6] = p1[2]*p2[4]+p1[3]*p2[3];
  p[7] = p1[3]*p2[4];
}


inline void convkernel_3_4_sub_from(double *__restrict__ p,const double *__restrict__ p1,const double *__restrict__ p2){
  p[0] -= p1[0]*p2[0];
  p[1] -= p1[0]*p2[1]+p1[1]*p2[0];
  p[2] -= p1[0]*p2[2]+p1[1]*p2[1]+p1[2]*p2[0];
  p[3] -= p1[0]*p2[3]+p1[1]*p2[2]+p1[2]*p2[1]+p1[3]*p2[0];
  p[4] -= p1[0]*p2[4]+p1[1]*p2[3]+p1[2]*p2[2]+p1[3]*p2[1];
  p[5] -= p1[1]*p2[4]+p1[2]*p2[3]+p1[3]*p2[2];
  p[6] -= p1[2]*p2[4]+p1[3]*p2[3];
  p[7] -= p1[3]*p2[4];
}




inline void convkernel_3_7_add_to(double *__restrict__ p,const double *__restrict__ p1,const double *__restrict__ p2){
  p[0] += p1[0]*p2[0];
  p[1] += p1[0]*p2[1]+p1[1]*p2[0];
  p[2] += p1[0]*p2[2]+p1[1]*p2[1]+p1[2]*p2[0];
  p[3] += p1[0]*p2[3]+p1[1]*p2[2]+p1[2]*p2[1]+p1[3]*p2[0];
  p[4] += p1[0]*p2[4]+p1[1]*p2[3]+p1[2]*p2[2]+p1[3]*p2[1];
  p[5] += p1[0]*p2[5]+p1[2]*p2[3]+p1[1]*p2[4]+p1[3]*p2[2];
  p[6] += p1[3]*p2[3]+p1[1]*p2[5]+p1[0]*p2[6]+p1[2]*p2[4];
  p[7] += p1[3]*p2[4]+p1[0]*p2[7]+p1[1]*p2[6]+p1[2]*p2[5];
  p[8] += p1[2]*p2[6]+p1[1]*p2[7]+p1[3]*p2[5];
  p[9] += p1[2]*p2[7]+p1[3]*p2[6];
  p[10] += p1[3]*p2[7];
}



/* Compute the nullspace by computing determinants.
   This can be optimized further by computing the 
   determinants as expansion along a row. 
 */
inline void hs_NullSpace_d3x4( double *__restrict__ ns, const double *__restrict__ A){
  //Computing the nullspace by compting determinants of 3x3 blocks. 
  
  //First compute all 5 2x2 blocks of the two first rows. 
  double subdets[6]; 
  subdets[0] = A[0]*A[5]-A[1]*A[4];
  subdets[1] = A[0]*A[6]-A[2]*A[4];
  subdets[2] = A[0]*A[7]-A[3]*A[4]; 
  subdets[3] = A[1]*A[6]-A[2]*A[5]; 
  subdets[4] = A[1]*A[7]-A[3]*A[5]; 
  subdets[5] = A[2]*A[7]-A[3]*A[6]; 

  //Then compute the actual determintants. 
  ns[0] = A[9]*subdets[5] - A[10]*subdets[4]+A[11]*subdets[3]; 
  ns[1] = -A[8]*subdets[5] + A[10]*subdets[2] - A[11]*subdets[1]; 
  ns[2]= A[8]*subdets[4]-A[9]*subdets[2]+A[11]*subdets[0]; 
  ns[3]= -A[8]*subdets[3]+A[9]*subdets[1]-A[10]*subdets[0]; 
}




void elim_A( double *A){
  /*Pivot columns are 0,1,2,4,5*/
  int pivcols[] ={0,1,2,4,5}; 
  int i, j, k; 
  for(i=0; i < 5; i++){
    int col = pivcols[i]; 
    double *__restrict__ A_pivrow = &A[i*12]; 
    double temp_inv = 1/A_pivrow[col];
    for(k=0;k<12;k++)
      A_pivrow[k] *= temp_inv;    
    for(j=0;j<5;j++){
      if( i!= j){
	double *__restrict__ A_currrow = &A[j*12]; 
	double temp = -A_currrow[col]; 
	for(k=0;k<12;k++)
	  A_currrow[k] += temp*A_pivrow[k];
      }
    }      
  }
}




void compute_A( double * __restrict__ A, const double *__restrict__ u, const double *__restrict__ up){
  int i; 
  for(i=0; i < 5; i++){
    const double *__restrict__ ut = &u[3*i];
    const double *__restrict__ upt = &up[3*i];
    double *__restrict__ At = &A[4*3*i];

    At[0] = ut[2]*ut[2]+ut[1]*ut[1];
    At[1] = -ut[1]*ut[0];  //1
    At[2] = -ut[2]*ut[0];  //2
    At[3] = At[1]; // -ut[1]*ut[0];  //1 
    At[4] = ut[2]* ut[2]+ut[0]*ut[0];
    At[5] = -ut[2]*ut[1];  //3
    At[6] = At[2]; // -ut[2]*ut[0];  //2
    At[7] = At[5]; // -ut[2]*ut[1];  //3
    At[8] = ut[1]*ut[1]+ut[0]*ut[0];
    At[9] = ut[2]*upt[1]-ut[1]*upt[2];
    At[10] = -ut[2]*upt[0]+ut[0]*upt[2];
    At[11] = ut[1]*upt[0]-ut[0]*upt[1];
  }
  elim_A(A); 
}



inline void elim_A_special( double *A){

  //int pivcols[] ={3,4,5,6,7}; 
  int i, j, k; 
  for(i=0; i < 5; i++){
    int col = i+3; // pivcols[i]; 
    double *__restrict__ A_pivrow = &A[i*12]; 
    double temp_inv = 1/A_pivrow[col];
    for(k=3;k<12;k++)
      A_pivrow[k] *= temp_inv;    
    for(j=(i+1);j<5;j++){
      //      if( i!= j){
      double *__restrict__ A_currrow = &A[j*12]; 
      double temp = -A_currrow[col]; 
      for(k=3;k<12;k++)
	A_currrow[k] += temp*A_pivrow[k];
	//}
    }      
  }
  //  print_matrix( A, 5, 12); 
  for(i=4; i >= 0; i--){
    double *__restrict__ A_pivrow = &A[i*12]; 
    for(j=0; j< i ;j++){
      double *__restrict__ A_currrow = &A[j*12]; 
      double temp = A_currrow[i+3];
      //  printf("temp %f\n" ,temp);
      A_currrow[8] -= temp*A_pivrow[8];
      A_currrow[9] -= temp*A_pivrow[9];
      A_currrow[10] -= temp*A_pivrow[10];
      A_currrow[11] -= temp*A_pivrow[11];
    }
  }
  //  print_matrix( A, 5, 12); 
}




void compute_A_special( double * __restrict__ A, const double *__restrict__ u, const double *__restrict__ up){
  int i; 
  for(i=0; i < 5; i++){
    const double *__restrict__ ut = &u[3*i];
    const double *__restrict__ upt = &up[3*i];
    double *__restrict__ At = &A[4*3*i];
  /*Pivot columns are 0,1,2,4,5*/
    At[3] = ut[2]*ut[2]+ut[1]*ut[1];
    At[4] = -ut[1]*ut[0];  //1
    At[5] = -ut[2]*ut[0];  //2

    //     At[3] = At[1]; // -ut[1]*ut[0];  //1 
    At[6] = ut[2]* ut[2]+ut[0]*ut[0];
    At[7] = -ut[2]*ut[1];  //3
    //At[6] = At[2]; // -ut[2]*ut[0];  //2
    //At[7] = At[5]; // -ut[2]*ut[1];  //3
    At[8] = ut[1]*ut[1]+ut[0]*ut[0];
    At[9] = ut[2]*upt[1]-ut[1]*upt[2];
    At[10] = -ut[2]*upt[0]+ut[0]*upt[2];
    At[11] = ut[1]*upt[0]-ut[0]*upt[1];
  }
  elim_A_special(A); 
}




void compute_all_dets_of_A_for_elim_A( double *__restrict__  detA, const double *__restrict__  A){
  //First compute the subdets on the right hand side; 
  //This can actually be optimized since there is 
  //sparsity structure in the right hand side as well. 
  
  detA[0*15 + 0] = 0;
  detA[0*15 + 1] = 0;
  detA[0*15 + 2] = -A[57];
  detA[0*15 + 3] = -A[58]+A[33];
  detA[0*15 + 4] = A[34];
  detA[0*15 + 5] = 0;
  detA[0*15 + 6] = 2*A[45];
  detA[0*15 + 7] = -A[21]-A[59]-A[32]*A[57]+A[33]*A[56]+2*A[46];
  detA[0*15 + 8] = A[35]-A[22]-A[32]*A[58]+A[34]*A[56];
  detA[0*15 + 9] = A[45]*A[56]-A[44]*A[57];
  detA[0*15 + 10] = A[32]*A[45]-A[44]*A[33]-A[44]*A[58]+A[46]*A[56]+A[20]*A[57]-A[21]*A[56]+2*A[47];
  detA[0*15 + 11] = -A[23]+A[20]*A[58]-A[22]*A[56]-A[32]*A[59]+A[35]*A[56]+A[32]*A[46]-A[44]*A[34];
  detA[0*15 + 12] = A[47]*A[56]-A[44]*A[59]-A[20]*A[45]+A[44]*A[21];
  detA[0*15 + 13] = A[44]*A[22]+A[20]*A[59]-A[23]*A[56]-A[20]*A[46]+A[32]*A[47]-A[44]*A[35];
  detA[0*15 + 14] = A[44]*A[23]-A[20]*A[47];
  detA[1*15 + 0] = 0;
  detA[1*15 + 1] = -A[57];
  detA[1*15 + 2] = -A[58]+A[33];
  detA[1*15 + 3] = A[34];
  detA[1*15 + 4] = 0;
  detA[1*15 + 5] = A[45];
  detA[1*15 + 6] = -A[32]*A[57]-A[59]+A[46]+A[33]*A[56];
  detA[1*15 + 7] = -A[9]+A[35]-A[32]*A[58]+A[34]*A[56];
  detA[1*15 + 8] = -A[10];
  detA[1*15 + 9] = A[47]+A[32]*A[45]-A[44]*A[33];
  detA[1*15 + 10] = -A[9]*A[56]-A[32]*A[59]+A[35]*A[56]+A[32]*A[46]-A[44]*A[34]+A[8]*A[57];
  detA[1*15 + 11] = -A[11]+A[8]*A[58]-A[10]*A[56];
  detA[1*15 + 12] = -A[8]*A[45]+A[44]*A[9]+A[32]*A[47]-A[44]*A[35];
  detA[1*15 + 13] = -A[11]*A[56]-A[8]*A[46]+A[44]*A[10]+A[8]*A[59];
  detA[1*15 + 14] = -A[8]*A[47]+A[44]*A[11];
  detA[2*15 + 0] = 0;
  detA[2*15 + 1] = -A[45];
  detA[2*15 + 2] = -A[46]+A[21];
  detA[2*15 + 3] = A[22]-A[9];
  detA[2*15 + 4] = -A[10];
  detA[2*15 + 5] = A[44]*A[57]-A[45]*A[56];
  detA[2*15 + 6] = -A[47]+A[44]*A[58]-A[46]*A[56]-A[20]*A[57]+A[21]*A[56];
  detA[2*15 + 7] = A[23]-A[20]*A[58]+A[22]*A[56]+A[8]*A[57]-A[9]*A[56];
  detA[2*15 + 8] = -A[11]+A[8]*A[58]-A[10]*A[56];
  detA[2*15 + 9] = -A[47]*A[56]+A[20]*A[45]+A[44]*A[59]-A[44]*A[21];
  detA[2*15 + 10] = -A[20]*A[59]+A[23]*A[56]+A[20]*A[46]-A[44]*A[22]-A[8]*A[45]+A[44]*A[9];
  detA[2*15 + 11] = -A[11]*A[56]-A[8]*A[46]+A[44]*A[10]+A[8]*A[59];
  detA[2*15 + 12] = A[20]*A[47]-A[44]*A[23];
  detA[2*15 + 13] = -A[8]*A[47]+A[44]*A[11];
  detA[2*15 + 14] = 0;
  detA[3*15 + 0] = A[57];
  detA[3*15 + 1] = A[58]-A[33];
  detA[3*15 + 2] = -A[34];
  detA[3*15 + 3] = 0;
  detA[3*15 + 4] = 0;
  detA[3*15 + 5] = A[32]*A[57]-A[33]*A[56]+A[59]-A[21];
  detA[3*15 + 6] = A[32]*A[58]-A[34]*A[56]+2*A[9]-A[35]-A[22];
  detA[3*15 + 7] = 2*A[10];
  detA[3*15 + 8] = 0;
  detA[3*15 + 9] = -A[8]*A[57]+A[9]*A[56]+A[32]*A[59]-A[35]*A[56]+A[20]*A[33]-A[21]*A[32]-A[23];
  detA[3*15 + 10] = 2*A[11]+A[20]*A[34]-A[22]*A[32]-A[8]*A[33]+A[9]*A[32]-A[8]*A[58]+A[10]*A[56];
  detA[3*15 + 11] = -A[8]*A[34]+A[10]*A[32];
  detA[3*15 + 12] = -A[8]*A[59]+A[11]*A[56]+A[8]*A[21]-A[20]*A[9]+A[20]*A[35]-A[23]*A[32];
  detA[3*15 + 13] = -A[8]*A[35]+A[11]*A[32]+A[8]*A[22]-A[20]*A[10];
  detA[3*15 + 14] = A[8]*A[23]-A[20]*A[11];
  detA[4*15 + 0] = A[45];
  detA[4*15 + 1] = A[46]-A[21];
  detA[4*15 + 2] = -A[22]+A[9];
  detA[4*15 + 3] = A[10];
  detA[4*15 + 4] = 0;
  detA[4*15 + 5] = A[47]+A[32]*A[45]-A[44]*A[33];
  detA[4*15 + 6] = -A[23]+A[32]*A[46]-A[44]*A[34]+A[20]*A[33]-A[21]*A[32];
  detA[4*15 + 7] = A[9]*A[32]+A[11]+A[20]*A[34]-A[22]*A[32]-A[8]*A[33];
  detA[4*15 + 8] = -A[8]*A[34]+A[10]*A[32];
  detA[4*15 + 9] = -A[8]*A[45]+A[44]*A[9]+A[32]*A[47]-A[44]*A[35];
  detA[4*15 + 10] = -A[8]*A[46]+A[44]*A[10]+A[8]*A[21]-A[20]*A[9]-A[23]*A[32]+A[20]*A[35];
  detA[4*15 + 11] = -A[8]*A[35]+A[11]*A[32]+A[8]*A[22]-A[20]*A[10];
  detA[4*15 + 12] = -A[8]*A[47]+A[44]*A[11];
  detA[4*15 + 13] = A[8]*A[23]-A[20]*A[11];
  detA[4*15 + 14] = 0;  
}


inline void convkernel_3_7(double * __restrict__  p,const double * __restrict__ p1,const double * __restrict__ p2){
  p[0] = p1[0]*p2[0];
  p[1] = p1[0]*p2[1]+p1[1]*p2[0];
  p[2] = p1[0]*p2[2]+p1[1]*p2[1]+p1[2]*p2[0];
  p[3] = p1[0]*p2[3]+p1[1]*p2[2]+p1[2]*p2[1]+p1[3]*p2[0];
  p[4] = p1[0]*p2[4]+p1[1]*p2[3]+p1[2]*p2[2]+p1[3]*p2[1];
  p[5] = p1[0]*p2[5]+p1[2]*p2[3]+p1[1]*p2[4]+p1[3]*p2[2];
  p[6] = p1[3]*p2[3]+p1[1]*p2[5]+p1[0]*p2[6]+p1[2]*p2[4];
  p[7] = p1[3]*p2[4]+p1[0]*p2[7]+p1[1]*p2[6]+p1[2]*p2[5];
  p[8] = p1[2]*p2[6]+p1[1]*p2[7]+p1[3]*p2[5];
  p[9] = p1[2]*p2[7]+p1[3]*p2[6];
  p[10] = p1[3]*p2[7];
}






inline void convkernel_3_7_sub_from(double *__restrict__ p,const double *__restrict__ p1,const double * __restrict__ p2){
  p[0] -= p1[0]*p2[0];
  p[1] -= p1[0]*p2[1]+p1[1]*p2[0];
  p[2] -= p1[0]*p2[2]+p1[1]*p2[1]+p1[2]*p2[0];
  p[3] -= p1[0]*p2[3]+p1[1]*p2[2]+p1[2]*p2[1]+p1[3]*p2[0];
  p[4] -= p1[0]*p2[4]+p1[1]*p2[3]+p1[2]*p2[2]+p1[3]*p2[1];
  p[5] -= p1[0]*p2[5]+p1[2]*p2[3]+p1[1]*p2[4]+p1[3]*p2[2];
  p[6] -= p1[3]*p2[3]+p1[1]*p2[5]+p1[0]*p2[6]+p1[2]*p2[4];
  p[7] -= p1[3]*p2[4]+p1[0]*p2[7]+p1[1]*p2[6]+p1[2]*p2[5];
  p[8] -= p1[2]*p2[6]+p1[1]*p2[7]+p1[3]*p2[5];
  p[9] -= p1[2]*p2[7]+p1[3]*p2[6];
  p[10]-= p1[3]*p2[7];
}


const int detinst3[]={1*15,2*15,1*15,3*15,1*15,4*15,2*15,3*15,2*15,4*15,3*15,4*15};
inline  void compute_poly3_subdets1( double * __restrict__ sub, const double * __restrict__ B){
  int i;
  for( i=0; i< 6; i++){
    int j1= detinst3[2*i]; 
    int j2= detinst3[2*i+1]; 

    double *temp3 = &sub[i*4]; 
    convkernel_1_2(temp3,&B[j1 + 12],&B[j2 + 9]);
    convkernel_1_2_sub_from(temp3,&B[j2 + 12],&B[j1 + 9]);
  }
}

inline  void compute_poly3_subdets2( double * __restrict__ sub, const double * __restrict__ B){
  int i;
  for( i=0; i< 6; i++){
    int j1= detinst3[2*i]; 
    int j2= detinst3[2*i+1]; 
    double *temp7 = &sub[i*8]; 

    convkernel_3_4(temp7,&B[j1 + 5],&B[j2 + 0]);
    convkernel_3_4_sub_from(temp7,&B[j2 + 5],&B[j1 + 0]);
  }
}

inline  void compute_poly3_subdets( double * __restrict__ sub1,double * __restrict__ sub2, const double * __restrict__ B){
  int i;
  for( i=0; i< 6; i++){
    int j1= detinst3[2*i]; 
    const double *Bj1 = &B[j1]; 
    int j2= detinst3[2*i+1]; 
    const double *Bj2 = &B[j2]; 


    double *temp3 = &sub1[i*4]; 
    convkernel_1_2(temp3,&Bj1[ 12],&Bj2[ 9]);
    convkernel_1_2_sub_from(temp3,&Bj2 [ 12],&Bj1 [ 9]);
    

    double *temp7 = &sub2[i*8];     
    convkernel_3_4(temp7,&Bj1[ 5],&Bj2[ 0]);
    convkernel_3_4_sub_from(temp7,&Bj2[ 5],&Bj1[ 0]);
  }
}


void compute_poly3( double * __restrict__ p, double *B){
  int i,j;
  double temp_inv = 1/B[14+0*15]; 
  for( i=1; i< 5; i++){
    double * __restrict__ Brow = &B[i*15]; 
    double temp = Brow[14]*temp_inv; 
    for(j=0; j < 15; j++){
      Brow[j] -= temp*B[0*15+j]; 
    }
  }
  double subdet1[6*4]; 
  double subdet2[6*8]; 
  //compute_poly3_subdets1( subdet1, B); 
  //compute_poly3_subdets2( subdet2, B); 
  compute_poly3_subdets( subdet1,subdet2,  B); 
  

  convkernel_3_7( p, &subdet1[4*0], &subdet2[8*(5-0)]); 
  convkernel_3_7_sub_from( p,  &subdet1[4*1], &subdet2[8*(5-1)]); 
  convkernel_3_7_add_to( p, &subdet1[4*2], &subdet2[8*(5-2)]); 
  convkernel_3_7_add_to( p, &subdet1[4*3], &subdet2[8*(5-3)]); 
  convkernel_3_7_sub_from( p,  &subdet1[4*4], &subdet2[8*(5-4)]); 
  convkernel_3_7_add_to( p, &subdet1[4*5], &subdet2[8*(5-5)]);
}


/* C1 is 4x4 */
void compute_C1_red( double * __restrict__ C1, const double *__restrict__ B, double x){
  double x_vec[5];
  int i,j;
  double xpow =x;
  for(i=0;i<3;i++){
    xpow *= x;
    x_vec[i] = xpow;
  }

  for( i=0;i<3;i++){
    double temp; 
    const double *__restrict__ Brow = &B[(i+1)*15]; 
    double *__restrict__ C1row = &C1[i*4]; 

    temp = Brow[0] + x*Brow[1]; 
    for( j=2; j<5;j++)
      temp += x_vec[j-2]*Brow[0+j];
    C1row[3] = temp; 

    temp = Brow[5] + x*Brow[5+1]; 
    for( j=2; j<4;j++)
      temp += x_vec[j-2]*Brow[5+j];
    C1row[2] = temp; 

    C1row[1] = Brow[9] + x*Brow[9+1] + x_vec[0]*Brow[9+2]; 

    C1row[0] = Brow[12+0] + x*Brow[12+1];

  }
  /*  X= [1;x;x*x;x*x*x;x*x*x*x];
      C1= ([B4*X(1) B3*X(1:2) B2*X(1:3) B1*X(1:4) B0*X(1:5)]); */
}



/*We will only compue the last 2x3 block*/
void compute_ns_of_C2_from_A_elim( double *__restrict__ ns, const double  *__restrict__ A, const double *__restrict__ T){
  double t0 = T[0]; 
  double t1 = T[1]; 
  double t2 = T[2]; 

  double a1 = t1; 

  double a2 = A[14*3+2]*t2;
  double a3 = A[15*3+0]*t0+ A[15*3+1]*t1+ A[15*3+2]*t2;  

  double b1 = t2; 
  double b2 = t1+ A[18*3+2]*t2;
  double b3 = A[19*3+0]*t0+ A[19*3+1]*t1+ A[19*3+2]*t2;  
  ns[1] = a2*b3-a3*b2;
  ns[2] = a3*b1-a1*b3;
  ns[3] = a1*b2-a2*b1; 
  
  /*Here we use the fact that this region of A is quite sparse and 
    we know the sparsity pattern*/
  double mysum =  ( A[2*3+ 2]*t2)*ns[2] + (A[3*3+ 0]*t0+ A[3*3+1]*t1+ A[3*3+ 2]*t2)*ns[3];
  ns[0] = -mysum; 
  ns[1]*=t0; 
  ns[2]*=t0; 
  ns[3]*=t0; 
}

void endgame(  double *SOLS,int nr,const double *r,const double *detA, const double *A){
  int i; 
  double C1[5*5];
  for(i=0; i<nr;i++){
    int i1;
    double V[5];

    compute_C1_red( C1,detA,r[i]); 
    hs_NullSpace_d3x4(V,C1);
    SOLS[6*i+5] =V[0]/V[1];

    /*
      We are using the simplified version since we have an 
      already reduced matrix. 
      compute_C1( C1,detA,r[i]);
      vv_NullSpace_d4x5(V,C1);
      SOLS[6*i+5] =V[1]/V[2];
    */

    
    SOLS[6*i+3] =1;
    SOLS[6*i+4] =r[i];


    double V2[4];
    compute_ns_of_C2_from_A_elim( V2, A, &SOLS[3 +i*6] );
    
    /*
    double C2[20];
    compute_C2( C2, A, &SOLS[3 +i*6]);
    vv_NullSpace_d3x4(V2,C2);
    */

    double temp = 1/V2[3];
    for(i1=0;i1<3;i1++){
      SOLS[i*6 + i1] = temp*V2[i1];
    }
  }
}



int five_flow( double *SOLS, const double *xx, const double *xxp, int polish_iterations, int bracket_iterations){
  double A[3*20];
  double detA[15*5];
  double p[11];
  double r[10];


  compute_A_special( A, xx, xxp);
  compute_all_dets_of_A_for_elim_A( detA, A); 
  compute_poly3( p, detA);

  // int i_temp[1000];double d_temp[1000];
  // int nr = vv_solvepolysturmd(r,p,10,i_temp,d_temp,polish_iterations,bracket_iterations);

int nr = MY_SPECIAL_POLYSOLVER(r,p,10,polish_iterations,bracket_iterations);
 
  endgame(SOLS,nr,r,  detA,A);  
  return nr;
}






#ifdef HS_COMPILE_MEX
#include <mex.h>
void mexFunction(int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[]){
  double *xx =mxGetPr( prhs[0]);
  double *xxp =mxGetPr( prhs[1]);
  double SOLS[6*10];
  int i;

  if( nrhs ==0) {
    printf("Calling sequence is SOLS=five_flow( x,xp,polish_iterations, bracket_iterations);\n"); 
    return; 
  }

  int polish_iterations = 30; 
  int bracket_iterations = 30; 
  if( nrhs >= 4) {
    polish_iterations = mxGetScalar( prhs[2] );
    bracket_iterations = mxGetScalar( prhs[3] );
    printf("Setting polish=%i polish=%i\n",  polish_iterations,bracket_iterations);
  }


  int nr = five_flow(  SOLS, xx, xxp, polish_iterations, bracket_iterations);
  plhs[0]=mxCreateDoubleMatrix(6,nr,mxREAL);
  double *SOLSout =  mxGetPr( plhs[0] ); 
  for( i=0; i< 6*nr; i++)
    SOLSout[i] = SOLS[i];
}

#else
#include <stdlib.h>

int main(int argc, void **argv){
  double xx[5*3];
  double xxp[5*3];
  double SOLS[6*10];
  int i,j;
  double nn = 1/(float)RAND_MAX;


  int polish_iterations = 30;
  int bracket_iterations = 200;
  if( argc > 1){
    sscanf( (char *)argv[1],"%i " ,  &polish_iterations);
  }

  for( i=0;i<100000;i++){
    for(j=0; j < 15; j++){
      xx[j] = nn*(float)rand();
      xxp[j] = nn*(float)rand();
    }
    for(j=0; j < 10; j++)
      int nr = five_flow(  SOLS, xx, xxp,30,30);
  }
}
#endif 
