freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][gsoc-anurag-2023] Add ARM NEON support


From: Anurag Thakur (@AdbhutDev)
Subject: [Git][freetype/freetype][gsoc-anurag-2023] Add ARM NEON support
Date: Sun, 08 Oct 2023 01:10:08 +0000

Anurag Thakur pushed to branch gsoc-anurag-2023 at FreeType / FreeType

Commits:

  • 6e79a1ee
    by Anurag Thakur at 2023-10-08T06:39:26+05:30
    Add ARM NEON support
    

4 changed files:

Changes:

  • src/base/ftobjs.c
    ... ... @@ -3154,7 +3154,7 @@ int conic_to2(FT_GlyphSlot* slot, FT_Vector *control, FT_Vector *from, FT_Vector
    3154 3154
           face->garray = (FT_GlyphSlot*)malloc(
    
    3155 3155
               face->driver->clazz->slot_object_size * face->num_glyphs );
    
    3156 3156
           //error           = FT_Set_Char_Size( face, 0, 160 * 64, 300, 300 );
    
    3157
    -      error           = FT_Set_Pixel_Sizes( face, 0, 500);
    
    3157
    +      error           = FT_Set_Pixel_Sizes( face, 0, 100);
    
    3158 3158
           // int glyph_index = FT_Get_Char_Index( face, 'A' );
    
    3159 3159
           // error           = FT_Load_Glyph( face, glyph_index, FT_LOAD_NO_HINTING );
    
    3160 3160
     
    

  • src/dense/ftdense.c
    ... ... @@ -23,11 +23,18 @@
    23 23
     #  define FT_SSE4_1 0
    
    24 24
     #endif
    
    25 25
     
    
    26
    +#if defined(__ARM_NEON)
    
    27
    +#define FT_NEON 1
    
    28
    +else 
    
    29
    +#define FT_NEON 0
    
    30
    +#endif
    
    26 31
     
    
    27 32
     #if FT_SSE4_1
    
    28 33
     
    
    29 34
         #include <immintrin.h>
    
    30 35
     
    
    36
    +#elif FT_NEON
    
    37
    +    #include <arm_neon.h>
    
    31 38
     #endif
    
    32 39
     
    
    33 40
     #define PIXEL_BITS 8
    
    ... ... @@ -91,7 +98,6 @@ dense_render_line( dense_worker* worker, FT_Pos fromx, FT_Pos fromy, FT_Pos tox,
    91 98
       return;
    
    92 99
     }
    
    93 100
     
    
    94
    -
    
    95 101
     void
    
    96 102
     dense_render_line2( dense_worker* worker, FT_PreLine pl )
    
    97 103
     {
    
    ... ... @@ -212,8 +218,9 @@ dense_render_line2( dense_worker* worker, FT_PreLine pl )
    212 218
           if ( x1i <= x0i + 1 )
    
    213 219
           {
    
    214 220
             FT26D6 xmf = ( ( x + xnext )>>1) - x0floor;
    
    215
    -        m_a[linestart + x0i] += d * ((1<<6) - xmf);
    
    216
    -        m_a[linestart + ( x0i + 1 )] += d * xmf;
    
    221
    +        FT20D12 dxmf = d*xmf;
    
    222
    +        m_a[linestart + x0i] += (d * 64) - dxmf;
    
    223
    +        m_a[linestart + ( x0i + 1 )] += dxmf;
    
    217 224
           }
    
    218 225
           else
    
    219 226
           {
    
    ... ... @@ -252,6 +259,8 @@ dense_render_line2( dense_worker* worker, FT_PreLine pl )
    252 259
           x = xnext;
    
    253 260
         }
    
    254 261
       }
    
    262
    + 
    
    263
    + 
    
    255 264
     }
    
    256 265
     
    
    257 266
     
    
    ... ... @@ -456,75 +465,62 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target, FT_PreLine pl
    456 465
     
    
    457 466
     #if FT_SSE4_1
    
    458 467
     
    
    459
    -__m128i offset = _mm_setzero_si128();
    
    460
    -  __m128i mask   = _mm_set1_epi32( 0x0c080400 );
    
    468
    +  __m128i offset = _mm_setzero_si128();
    
    461 469
       __m128i nzero = _mm_castps_si128(_mm_set1_ps(-0.0));
    
    462 470
     
    
    463 471
       for (int i = 0; i < worker->m_h*worker->m_w; i += 4)
    
    464 472
       {
    
    465 473
         // load 4 floats from source
    
    466 474
     
    
    467
    -    //printf("%d\n", source[i]);
    
    468 475
         __m128i x = _mm_load_si128( (__m128i*)&source[i] );
    
    469 476
     
    
    470 477
         x = _mm_add_epi32( x, _mm_slli_si128( x, 4 ) );
    
    471 478
     
    
    472
    -    // x = _mm_add_epi32(
    
    473
    -    //     x, _mm_castps_si128( _mm_shuffle_ps( _mm_setzero_ps(),
    
    474
    -    //                                          _mm_castsi128_ps( x ), 0x40 ) ) );
    
    475 479
         x = _mm_add_epi32(x, _mm_slli_si128(x,8));
    
    476 480
     
    
    477 481
         // add the prefsum of previous 4 floats to all current floats
    
    478 482
         x = _mm_add_epi32( x, offset );
    
    479 483
     
    
    484
    +    __m128i y = _mm_srli_epi32( _mm_abs_epi32( x) , 4 );
    
    485
    +     y = _mm_packs_epi32(y, nzero);
    
    486
    +     y = _mm_packus_epi16(y, nzero);
    
    480 487
     
    
    488
    +    // int* ptr = (int*)&dest[i];
    
    489
    +    _mm_storeu_si32(&dest[i], y);
    
    481 490
     
    
    482
    -    // __m128 y = _mm_mul_ps(_mm_castsi128_ps(x), _mm_set1_ps(255.9));
    
    483
    -
    
    484
    -    // y = _mm_andnot_ps(_mm_castsi128_ps(nzero), y);
    
    491
    +    offset = _mm_shuffle_epi32(x,_MM_SHUFFLE( 3, 3, 3, 3 ) );
    
    492
    +  }
    
    485 493
     
    
    486
    -    // __m128i z = _mm_cvttps_epi32(y);
    
    487
    -    // z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
    
    494
    +#elif FT_NEON
    
    488 495
     
    
    489
    -    
    
    496
    +  int32x4_t offset = vdupq_n_s32(0);
    
    497
    +  int32x4_t nzero =  vreinterpretq_s32_f32(vdupq_n_f32(-0.0));
    
    490 498
     
    
    491
    -    // int yu = ;
    
    492
    -    // *((int*)dest+i) = yu;
    
    499
    +  for (int i = 0; i < worker->m_h*worker->m_w; i += 4)
    
    500
    +  {
    
    501
    +    // load 4 floats from source
    
    493 502
     
    
    503
    +    int32x4_t x = vld1q_s32( (int32_t*)&source[i] );
    
    494 504
     
    
    495
    -    // take absolute value
    
    496
    -    //__m128i y = _mm_abs_epi32( x );  // fabs(x)
    
    505
    +    x = vaddq_s32( x, vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32( x), 12) ));
    
    497 506
     
    
    507
    +    x = vaddq_s32(x, vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(x), 8)));
    
    498 508
     
    
    499
    -    // cap max value to 1
    
    500
    -    //y = _mm_min_epi32( _mm_srli_epi32( y, 4 ), _mm_set1_epi32( 255 ) );
    
    501
    -    //__m128i y = _mm_abs_epi32(_mm_srai_epi32(  x , 4 ));
    
    502
    -    __m128i y = _mm_srli_epi32( _mm_abs_epi32( x) , 4 );
    
    509
    +    // add the prefsum of previous 4 floats to all current floats
    
    510
    +    x = vaddq_s32( x, offset );
    
    503 511
     
    
    504
    -    // reduce to 255
    
    505
    -    // y = 
    
    512
    +    int32x4_t y = vshrq_n_s32( vabsq_s32( x) , 4 );
    
    513
    +    y = vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(y), vqmovn_s32(nzero)));
    
    514
    +    y = vreinterpretq_s32_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(y)), vqmovun_s16(vreinterpretq_s16_s32(nzero))));
    
    506 515
     
    
    507
    -    // // shuffle
    
    508
    -     //y = _mm_shuffle_epi8( y, mask );
    
    509
    -    
    
    510
    -     y = _mm_packus_epi16(_mm_packs_epi32(y, nzero), nzero);
    
    511
    -    //__m128i z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
    
    516
    +     //y = vreinterpretq_u32_u8(vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_u32(y)), vqmovn_s32(nzero))), vqmovun_s16(vreinterpretq_s16_s32(nzero))));
    
    512 517
     
    
    513 518
         // int* ptr = (int*)&dest[i];
    
    514
    -    _mm_storeu_si32(&dest[i], y);
    
    515
    -    //*(int*)&dest[i] =  *(int*)&y;
    
    516
    -    //*(int*)&dest[i] =  _mm_extract_epi32(y, 0);
    
    517
    -
    
    518
    -    //_mm_store_ss( (float*)&dest[i], _mm_castsi128_ps(y) );
    
    519
    +    
    
    520
    +    vst1q_s32(&dest[i], y);
    
    519 521
     
    
    520
    -    // store the current prefix sum in offset
    
    521
    -    // offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
    
    522
    -    //                                            _mm_castsi128_ps( x ),
    
    523
    -    //                                            _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
    
    524
    -    offset = _mm_shuffle_epi32(x,_MM_SHUFFLE( 3, 3, 3, 3 ) );
    
    525
    -    //offset = _mm_set1_epi32(_mm_extract_epi32(x, 3));
    
    522
    +    offset = vdupq_laneq_s32(x,3 );
    
    526 523
       }
    
    527
    -
    
    528 524
     #else /* FT_SSE4_1 */
    
    529 525
     
    
    530 526
       FT20D12 value = 0;
    
    ... ... @@ -534,7 +530,7 @@ __m128i offset = _mm_setzero_si128();
    534 530
         value += *source++;
    
    535 531
     
    
    536 532
         if(value > 0){
    
    537
    -      int n = value >>4;_Pos fromx, FT_Pos fromy, FT_Pos tox, FT_Pos toy
    
    533
    +      int n = value >>4;
    
    538 534
     
    
    539 535
           if(n>255)n=255;
    
    540 536
           *dest = (unsigned char)n;
    
    ... ... @@ -588,10 +584,10 @@ dense_raster_render( FT_Raster raster, const FT_Raster_Params* params )
    588 584
     
    
    589 585
       int size = (worker->m_w * worker->m_h + 3) & ~3;
    
    590 586
     
    
    591
    -  worker->m_a      = malloc( sizeof( FT20D12 ) * size );
    
    587
    +  worker->m_a      = calloc( size, sizeof( FT20D12 ));
    
    592 588
       worker->m_a_size = size;
    
    593 589
     
    
    594
    -  memset( worker->m_a, 0, ( sizeof( FT20D12 ) * size ) );
    
    590
    +  //memset( worker->m_a, 0, ( sizeof( FT20D12 ) * size ) );
    
    595 591
       /* exit if nothing to do */
    
    596 592
       if ( worker->m_w <= worker->m_origin_x || worker->m_h <= worker->m_origin_y )
    
    597 593
       {
    

  • src/dense/ftdense.h
    ... ... @@ -26,7 +26,7 @@ extern "C"
    26 26
       typedef struct
    
    27 27
       {
    
    28 28
         /** The array used to store signed area differences. */
    
    29
    -    float* m_a;
    
    29
    +    FT20D12* m_a;
    
    30 30
         /** The number of elements in m_a. */
    
    31 31
         int m_a_size;
    
    32 32
         /** The width of the current raster in pixels. */
    

  • src/dense/ftdenserend.c
    ... ... @@ -140,7 +140,7 @@
    140 140
     
    
    141 141
     
    
    142 142
         /* allocate new one */
    
    143
    -    if ( FT_ALLOC_MULT( bitmap->buffer, bitmap->rows, bitmap->pitch ) )
    
    143
    +    if ( FT_ALLOC_MULT( bitmap->buffer,1, bitmap->rows* bitmap->pitch +16) ) // +16 is for alignment for SIMD
    
    144 144
           goto Exit;
    
    145 145
     
    
    146 146
         slot->internal->flags |= FT_GLYPH_OWN_BITMAP;
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]