00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifndef VECTOR_INSTR_H
00018 #define VECTOR_INSTR_H
00019
00020 #ifdef DEBUG
00021 #include <assert.h>
00022 #include <iostream>
00023 using namespace std;
00024 #endif
00025
00026 #ifdef __cplusplus
00027 extern "C"{
00028 #endif
00029
/*
 * Disabled (#if 0) alternative implementation of vector_cmpswap_noret.
 *
 * It loads the 16 bytes at arg1 and arg2, combines them with pmaxsw/pminsw,
 * and stores the maxima to arg2 and the minima to arg1.
 * pmaxsw/pminsw compare packed signed 16-bit words, so this variant does not
 * treat the data as four 32-bit lanes the way the pcmpgtd-based active
 * version does.
 * NOTE(review): "%eax" is listed as clobbered although no instruction here
 * writes it, while xmm0/xmm1 are written but not listed.
 */
00038 #if 0
00039
00040 inline void vector_cmpswap_noret(int * arg1, int * arg2){
00041 __asm__(
00042
00043 "movdqa (%0), %%xmm0\n\t"
00044 "movdqa (%1), %%xmm1\n\t"
00045
00046
00047 "pmaxsw (%1), %%xmm0\n\t"
00048 "pminsw (%0), %%xmm1\n\t"
00049
00050 "movdqa %%xmm0, (%1)\n\t"
00051 "movdqa %%xmm1, (%0)\n\t"
00052 :
00053 :"r"(arg1), "r"(arg2)
00054 :"memory", "cc","%eax"
00055 );
00056 }
00057 #endif
00058
/**
 * Lane-wise compare-and-swap of two vectors of four 32-bit signed ints.
 *
 * On return each lane of arg1 holds the smaller and the same lane of arg2
 * holds the larger of the two values originally found there. When no lane of
 * arg1 is greater than its counterpart in arg2 the stores are skipped.
 *
 * Both pointers are accessed with movdqa and must therefore be 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 vector that receives the per-lane minima
 * @param arg2 vector that receives the per-lane maxima
 */
static inline void vector_cmpswap_noret(int * arg1, int * arg2){
    __asm__(
        /* xmm0 = *arg1, xmm1 = *arg2 */
        "movdqa (%0), %%xmm0\n\t"
        "movdqa (%1), %%xmm1\n\t"

        /* xmm2 keeps an unmodified copy of *arg1 */
        "movdqa %%xmm0, %%xmm2\n\t"

        /* xmm0 = per-lane mask of (arg1 > arg2); leave early if it is empty */
        "pcmpgtd %%xmm1, %%xmm0\n\t"
        "pmovmskb %%xmm0, %%eax\n\t"
        "testw %%ax, %%ax\n\t"
        "jz 0f\n\t"
        "movdqa %%xmm0, %%xmm4\n\t"

        /* *arg1 = (arg2 & mask) ^ (arg1 & ~mask): the minima */
        "movdqa %%xmm1, %%xmm3\n\t"
        "pand %%xmm0, %%xmm1\n\t"
        "pandn %%xmm2, %%xmm0\n\t"
        "pxor %%xmm0, %%xmm1\n\t"
        "movdqa %%xmm1, (%0)\n\t"

        /* *arg2 = (arg1 & mask) ^ (arg2 & ~mask): the maxima */
        "pand %%xmm4, %%xmm2\n\t"
        "pandn %%xmm3, %%xmm4\n\t"
        "pxor %%xmm2, %%xmm4\n\t"
        "movdqa %%xmm4, (%1)\n\t"
        "0:\n\t"
        :
        :"r"(arg1), "r"(arg2)
        /* every register written above is listed so the compiler does not
           keep live values in them across this statement */
        :"memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
    );
}
00099
/**
 * Lane-wise compare-and-swap of two vectors of four 32-bit signed ints,
 * reporting whether anything had to be exchanged.
 *
 * On return each lane of arg1 holds the smaller and the same lane of arg2
 * holds the larger of the two values originally found there. When no lane of
 * arg1 is greater than its counterpart in arg2 nothing is stored.
 *
 * Both pointers are accessed with movdqa and must therefore be 16-byte
 * aligned (checked by assert when DEBUG and VERBOSE are both defined).
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 vector that receives the per-lane minima
 * @param arg2 vector that receives the per-lane maxima
 * @return 1 if at least one lane of arg1 was greater than arg2, 0 otherwise
 */
static inline int vector_cmpswap(int * arg1, int * arg2){
#if defined(DEBUG) && defined (VERBOSE)
    assert(((unsigned long) arg1)%16 == 0);
    assert(((unsigned long) arg2)%16 == 0);
    cout<<"VECTOR_CMPSWAP{\n";
    printVector(arg1);
    printVector(arg2);
#endif
    char res;
    int mask=0;
    __asm__(
        /* xmm0 = *arg1, with an unmodified copy in xmm2 */
        "movdqa (%2), %%xmm0\n\t"
        "movdqa %%xmm0, %%xmm2\n\t"

        /* xmm1 = *arg2, with an unmodified copy in xmm3 */
        "movdqa (%3), %%xmm1\n\t"
        "movdqa %%xmm1, %%xmm3\n\t"

        /* xmm0 = per-lane mask of (arg1 > arg2) */
        "pcmpgtd %%xmm1, %%xmm0\n\t"

        /* res = 0; skip the exchange if the mask is empty, else res = 1 */
        "xorb %0, %0\n\t"
        "pmovmskb %%xmm0, %1\n\t"
        "test %1, %1\n\t"
        "jz 1f\n\t"
        "movb $1, %0\n\t"

        "movdqa %%xmm0, %%xmm4\n\t"

        /* *arg1 = (arg2 & mask) ^ (arg1 & ~mask): the minima */
        "pand %%xmm0, %%xmm1\n\t"
        "pandn %%xmm2, %%xmm0\n\t"
        "pxor %%xmm0, %%xmm1\n\t"
        "movdqa %%xmm1, (%2)\n\t"

        /* *arg2 = (arg1 & mask) ^ (arg2 & ~mask): the maxima */
        "pand %%xmm4, %%xmm2\n\t"
        "pandn %%xmm3, %%xmm4\n\t"
        "pxor %%xmm2, %%xmm4\n\t"
        "movdqa %%xmm4, (%3)\n\t"
        "1:"
        /* early-clobber: both outputs are written before the inputs are
           used for the last time */
        :"=&a"(res),"=&d"(mask)
        :"S"(arg1), "D"(arg2)
        /* every xmm register written above is listed so the compiler does
           not keep live values in them across this statement */
        :"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
    );
#if defined(DEBUG) && defined (VERBOSE)
    printVector(arg1);
    printVector(arg2);
    for(int i=0;i<4;i++){
        if(arg1[i]>arg2[i]){
            cout<<"Error";
            assert(arg1[i]<=arg2[i]);
        }
    }
    cout <<"}END_VECTOR_CMPSWP\n";
#endif
    return res;
}
00158
/**
 * Skewed compare-and-swap: orders arg1[i] against arg2[i+1] for i = 0..2.
 *
 * arg1 is temporarily shifted up by one lane so that the lane-wise
 * vector_cmpswap lines up arg1[i] with arg2[i+1]; afterwards the shift is
 * undone. arg1[3] is saved and restored unchanged. Lane 0 is filled with
 * arg2[0], so it compares equal to itself and arg2[0] is left untouched.
 *
 * Both pointers are handed to vector_cmpswap, which needs them 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 vector whose lanes 0..2 receive the smaller values
 * @param arg2 vector whose lanes 1..3 receive the larger values
 * @return the value returned by vector_cmpswap for the shifted vectors
 */
static inline int vector_cmpswap_skew(int * arg1, int * arg2){
    int tail1=arg1[3];

    /* shift arg1 up by one lane */
    arg1[3]=arg1[2];
    arg1[2]=arg1[1];
    arg1[1]=arg1[0];

    /* neutral filler: lane 0 compares arg2[0] with itself */
    arg1[0]=arg2[0];

    int res = vector_cmpswap(arg1, arg2);

    /* shift back and restore the lane that did not take part */
    arg1[0]=arg1[1];
    arg1[1]=arg1[2];
    arg1[2]=arg1[3];
    arg1[3]=tail1;

    return res;
}
00189
/**
 * Skewed compare-and-swap without a result: orders arg1[i] against
 * arg2[i+1] for i = 0..2.
 *
 * arg1 is temporarily shifted up by one lane so that the lane-wise
 * vector_cmpswap_noret lines up arg1[i] with arg2[i+1]; afterwards the shift
 * is undone. arg1[3] is saved and restored unchanged. Lane 0 is filled with
 * arg2[0], so it compares equal to itself and arg2[0] is left untouched.
 *
 * Both pointers are handed to vector_cmpswap_noret, which needs them 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 vector whose lanes 0..2 receive the smaller values
 * @param arg2 vector whose lanes 1..3 receive the larger values
 */
static inline void vector_cmpswap_skew_noret(int * arg1, int * arg2){
    int tail1=arg1[3];

    /* shift arg1 up by one lane */
    arg1[3]=arg1[2];
    arg1[2]=arg1[1];
    arg1[1]=arg1[0];

    /* neutral filler: lane 0 compares arg2[0] with itself */
    arg1[0]=arg2[0];

    vector_cmpswap_noret(arg1, arg2);

    /* shift back and restore the lane that did not take part */
    arg1[0]=arg1[1];
    arg1[1]=arg1[2];
    arg1[2]=arg1[3];
    arg1[3]=tail1;
}
00208
/**
 * Transposes, in place, a 4x4 matrix of 32-bit ints stored row by row in 16
 * consecutive ints.
 *
 * The rows are read and written with movdqa, so array must be 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param array 16 ints; element [r*4 + c] is exchanged with [c*4 + r]
 */
static inline void transpose4_4(int * array){
    __asm__(
        /* xmm0/xmm4 = row 0, xmm1/xmm5 = row 1, xmm2 = row 2, xmm3 = row 3 */
        "movdqa (%0), %%xmm0\n\t"
        "movdqa %%xmm0, %%xmm4\n\t"
        "movdqa 16(%0), %%xmm1\n\t"
        "movdqa %%xmm1, %%xmm5\n\t"
        "movdqa 32(%0), %%xmm2\n\t"
        "movdqa 48(%0), %%xmm3\n\t"

        /* first stage: interleave row 0 with row 2 and row 1 with row 3 */
        "punpckhdq %%xmm2, %%xmm0\n\t"
        "punpckldq %%xmm2, %%xmm4\n\t"
        "punpckhdq %%xmm3, %%xmm1\n\t"
        "punpckldq %%xmm3, %%xmm5\n\t"

        "movdqa %%xmm0, %%xmm2\n\t"
        "movdqa %%xmm4, %%xmm3\n\t"

        /* second stage: interleave the halves into complete columns
           (xmm3 = column 0, xmm4 = column 1, xmm2 = column 2, xmm0 = column 3) */
        "punpckhdq %%xmm1, %%xmm0\n\t"
        "punpckldq %%xmm1, %%xmm2\n\t"
        "punpckhdq %%xmm5, %%xmm4\n\t"
        "punpckldq %%xmm5, %%xmm3\n\t"

        /* store the columns as the new rows */
        "movdqa %%xmm3, (%0)\n\t"
        "movdqa %%xmm4, 16(%0)\n\t"
        "movdqa %%xmm2, 32(%0)\n\t"
        "movdqa %%xmm0, 48(%0)\n\t"
        :
        :"r"(array)
        /* every xmm register written above is listed so the compiler does
           not keep live values in them across this statement */
        :"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
    );
}
00242
/**
 * Turns an array of 4-int vectors into vectors that are each sorted in
 * ascending order.
 *
 * Vectors are handled in groups of four (16 ints): a compare-and-swap
 * network orders the four vectors lane by lane, then the 4x4 block is
 * transposed so that every vector of the group ends up sorted. Because of
 * the transpose, a resulting vector holds one lane taken from each of the
 * four input vectors of its group rather than the elements of a single input
 * vector.
 * Vectors left over after the last full group are sorted individually, in
 * place, with scalar code.
 *
 * The grouped path uses vector_cmpswap_noret and transpose4_4, which need
 * array to be 16-byte aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param array   vec_len * 4 ints
 * @param vec_len number of 4-int vectors in array
 */
static inline void sortInVec(int * array, int vec_len){
    int group_len = vec_len/4;

    for(int g=0;g<group_len;g++){
        int * base = array+g*4*4;

        /* smallest of each lane ends up in vector 0 */
        vector_cmpswap_noret(base, base+4);
        vector_cmpswap_noret(base, base+8);
        vector_cmpswap_noret(base, base+12);

        /* second smallest of each lane ends up in vector 1 */
        vector_cmpswap_noret(base+4, base+8);
        vector_cmpswap_noret(base+4, base+12);

        /* order the remaining two vectors */
        vector_cmpswap_noret(base+8, base+12);

        /* sorted lanes (columns) become sorted vectors (rows) */
        transpose4_4(base);
    }

    /* leftover vectors: scalar exchange sort of the four elements of each */
    for(int k=group_len*4;k<vec_len;k++){
        int * vec = array+4*k;
        for(int i=0;i<3;i++){
            for(int j=i+1;j<4;j++){
                if(vec[i]>vec[j]){
                    int tmp = vec[j];
                    vec[j] = vec[i];
                    vec[i] = tmp;
                }
            }
        }
    }
}
00284
/**
 * Copies one 16-byte vector (four ints) from arg1 to arg2.
 *
 * Both pointers are accessed with movdqa and must therefore be 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 source vector
 * @param arg2 destination vector
 */
static inline void copyVector(int * arg1, int * arg2){
    __asm__(
        /* load from the source ... */
        "movdqa (%0), %%xmm0\n\t"
        /* ... and store to the destination */
        "movdqa %%xmm0, (%1)\n\t"
        :
        :"r"(arg1), "r"(arg2)
        /* xmm0 is overwritten, so it is listed as clobbered */
        :"memory", "cc", "xmm0"
    );
}
00296
/**
 * Exchanges the contents of two 16-byte vectors (four ints each).
 *
 * Both pointers are accessed with movdqa and must therefore be 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 first vector; receives the former contents of arg2
 * @param arg2 second vector; receives the former contents of arg1
 */
static inline void swapVector(int * arg1, int * arg2){
    __asm__(
        /* load both vectors before either one is overwritten */
        "movdqa (%0), %%xmm0\n\t"
        "movdqa (%1), %%xmm1\n\t"

        /* store them crosswise */
        "movdqa %%xmm0, (%1)\n\t"
        "movdqa %%xmm1, (%0)\n\t"

        :
        :"r"(arg1), "r"(arg2)
        /* xmm0 and xmm1 are overwritten, so they are listed as clobbered */
        :"memory", "cc", "xmm0", "xmm1"
    );
}
00313
/**
 * Merges two 4-int vectors with a fixed compare-and-swap network.
 *
 * Intended for inputs that are each sorted in ascending order: afterwards
 * arg1 holds the four smallest and arg2 the four largest of the eight
 * values, both again in ascending order.
 *
 * Both pointers are handed to vector_cmpswap_noret, which needs them 16-byte
 * aligned.
 *
 * Declared static: a plain C99/C11 `inline` definition in a header provides
 * no external definition, so any call the compiler does not inline would fail
 * to link.
 *
 * @param arg1 first vector; receives the lower half of the merged values
 * @param arg2 second vector; receives the upper half of the merged values
 */
static inline void vector_merge(int * arg1, int * arg2){
    /* stage 1: order arg1[k] against arg2[k] for every lane */
    vector_cmpswap_noret(arg1, arg2);

    /* stage 2: order arg1[k] against arg2[k-1] for k = 1..3. arg2 is shifted
       up by one lane for the vector operation; lane 0 is filled with arg1[0]
       so that it compares equal and stays as it is. */
    int tail = arg2[3];
    arg2[3]=arg2[2];
    arg2[2]=arg2[1];
    arg2[1]=arg2[0];
    arg2[0]=arg1[0];

    vector_cmpswap_noret(arg1, arg2);

    /* undo the shift and restore the lane that did not take part */
    arg2[0]=arg2[1];
    arg2[1]=arg2[2];
    arg2[2]=arg2[3];
    arg2[3]=tail;

    /* stage 3: scalar fix-ups across the boundary between the two vectors */
    if(arg2[0]<arg1[2]){
        int tmp = arg1[2];
        arg1[2] = arg2[0];
        arg2[0] = tmp;
    }
    if(arg2[1]<arg1[3]){
        int tmp = arg1[3];
        arg1[3] = arg2[1];
        arg2[1] = tmp;
    }
    if(arg2[0]<arg1[3]){
        int tmp = arg1[3];
        arg1[3] = arg2[0];
        arg2[0] = tmp;
    }
}
00350 #ifdef __cplusplus
00351 }
00352 #endif
00353 #endif