... |
... |
@@ -443,8 +443,7 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target, FT_PreLine pl |
443
|
443
|
}
|
444
|
444
|
// point.x = 100;
|
445
|
445
|
// point.y = 100;
|
446
|
|
- // dense_move_to(&point, worker);
|
447
|
|
-
|
|
446
|
+ // dense_move_to(&point, worker);
|
448
|
447
|
// Render into bitmap
|
449
|
448
|
const FT20D12* source = worker->m_a;
|
450
|
449
|
unsigned char* dest = target->buffer;
|
... |
... |
@@ -460,6 +459,7 @@ __m128i offset = _mm_setzero_si128(); |
460
|
459
|
{
|
461
|
460
|
// load 4 32-bit integer values from source (16-byte aligned load)
|
462
|
461
|
|
|
462
|
+ //printf("%d\n", source[i]);
|
463
|
463
|
__m128i x = _mm_load_si128( (__m128i*)&source[i] );
|
464
|
464
|
|
465
|
465
|
x = _mm_add_epi32( x, _mm_slli_si128( x, 4 ) );
|
... |
... |
@@ -481,27 +481,40 @@ __m128i offset = _mm_setzero_si128(); |
481
|
481
|
// __m128i z = _mm_cvttps_epi32(y);
|
482
|
482
|
// z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
|
483
|
483
|
|
484
|
|
- // *((int*)dest+i) = _mm_extract_epi16(z, 0);
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+ // int yu = ;
|
|
487
|
+ // *((int*)dest+i) = yu;
|
485
|
488
|
|
486
|
489
|
|
487
|
490
|
// take absolute value
|
488
|
|
- __m128i y = _mm_abs_epi32( x ); // fabs(x)
|
|
491
|
+ //__m128i y = _mm_abs_epi32( x ); // fabs(x)
|
|
492
|
+
|
489
|
493
|
|
490
|
494
|
// scale down by 16 (>> 4) and take the absolute value; clamping to 255 is done by the saturating packs below
|
491
|
|
- y = _mm_min_epi32( y, _mm_set1_epi32( 4094 ) );
|
|
495
|
+ //y = _mm_min_epi32( _mm_srli_epi32( y, 4 ), _mm_set1_epi32( 255 ) );
|
|
496
|
+ __m128i y = _mm_abs_epi32(_mm_srai_epi32( x , 4 ));
|
492
|
497
|
|
493
|
498
|
// reduce to 255
|
494
|
|
- y = _mm_srli_epi32( y, 4 );
|
|
499
|
+ // y =
|
|
500
|
+
|
|
501
|
+ // // shuffle
|
|
502
|
+ //y = _mm_shuffle_epi8( y, mask );
|
|
503
|
+
|
|
504
|
+ y = _mm_packus_epi16(_mm_packs_epi32(y, nzero), nzero);
|
|
505
|
+ //__m128i z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
|
495
|
506
|
|
496
|
|
- // shuffle
|
497
|
|
- y = _mm_shuffle_epi8( y, mask );
|
|
507
|
+ // int* ptr = (int*)&dest[i];
|
|
508
|
+ *(int*)&dest[i] = *(int*)&y;
|
|
509
|
+ //*(int*)&dest[i] = _mm_extract_epi32(y, 0);
|
498
|
510
|
|
499
|
|
- _mm_store_ss( (float*)&dest[i], _mm_castsi128_ps(y) );
|
|
511
|
+ //_mm_store_ss( (float*)&dest[i], _mm_castsi128_ps(y) );
|
500
|
512
|
|
501
|
513
|
// store the current prefix sum in offset
|
502
|
|
- offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
|
503
|
|
- _mm_castsi128_ps( x ),
|
504
|
|
- _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
|
|
514
|
+ // offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
|
|
515
|
+ // _mm_castsi128_ps( x ),
|
|
516
|
+ // _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
|
|
517
|
+ offset = _mm_shuffle_epi32(x,_MM_SHUFFLE( 3, 3, 3, 3 ) );
|
505
|
518
|
//offset = _mm_set1_epi32(_mm_extract_epi32(x, 3));
|
506
|
519
|
}
|
507
|
520
|
|
... |
... |
@@ -566,7 +579,7 @@ dense_raster_render( FT_Raster raster, const FT_Raster_Params* params ) |
566
|
579
|
worker->m_w = target_map->pitch;
|
567
|
580
|
worker->m_h = target_map->rows;
|
568
|
581
|
|
569
|
|
- int size = worker->m_w * worker->m_h + 4;
|
|
582
|
+ int size = (worker->m_w * worker->m_h + 3) & ~3;
|
570
|
583
|
|
571
|
584
|
worker->m_a = malloc( sizeof( FT20D12 ) * size );
|
572
|
585
|
worker->m_a_size = size;
|