freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][gsoc-anurag-2023] Optimize SIMD even more


From: Anurag Thakur (@AdbhutDev)
Subject: [Git][freetype/freetype][gsoc-anurag-2023] Optimize SIMD even more
Date: Thu, 05 Oct 2023 21:40:53 +0000

Anurag Thakur pushed to branch gsoc-anurag-2023 at FreeType / FreeType

Commits:

  • 55c25257
    by Anurag Thakur at 2023-10-06T03:10:27+05:30
    Optimize SIMD even more
    

2 changed files:

Changes:

  • src/base/ftobjs.c
    ... ... @@ -3154,7 +3154,7 @@ int conic_to2(FT_GlyphSlot* slot, FT_Vector *control, FT_Vector *from, FT_Vector
    3154 3154
           face->garray = (FT_GlyphSlot*)malloc(
    
    3155 3155
               face->driver->clazz->slot_object_size * face->num_glyphs );
    
    3156 3156
           //error           = FT_Set_Char_Size( face, 0, 160 * 64, 300, 300 );
    
    3157
    -      error           = FT_Set_Pixel_Sizes( face, 0, 500);
    
    3157
    +      error           = FT_Set_Pixel_Sizes( face, 0, 100);
    
    3158 3158
           // int glyph_index = FT_Get_Char_Index( face, 'A' );
    
    3159 3159
           // error           = FT_Load_Glyph( face, glyph_index, FT_LOAD_NO_HINTING );
    
    3160 3160
     
    

  • src/dense/ftdense.c
    ... ... @@ -443,8 +443,7 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target, FT_PreLine pl
    443 443
       }
    
    444 444
       // point.x = 100;
    
    445 445
       // point.y = 100;
    
    446
    -  // dense_move_to(&point, worker);
    
    447
    -  
    
    446
    +  // dense_move_to(&point, worker);  
    
    448 447
       // Render into bitmap
    
    449 448
       const FT20D12* source = worker->m_a;
    
    450 449
       unsigned char* dest     = target->buffer;
    
    ... ... @@ -460,6 +459,7 @@ __m128i offset = _mm_setzero_si128();
    460 459
       {
    
    461 460
         // load 4 floats from source
    
    462 461
     
    
    462
    +    //printf("%d\n", source[i]);
    
    463 463
         __m128i x = _mm_load_si128( (__m128i*)&source[i] );
    
    464 464
     
    
    465 465
         x = _mm_add_epi32( x, _mm_slli_si128( x, 4 ) );
    
    ... ... @@ -481,27 +481,40 @@ __m128i offset = _mm_setzero_si128();
    481 481
         // __m128i z = _mm_cvttps_epi32(y);
    
    482 482
         // z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
    
    483 483
     
    
    484
    -    // *((int*)dest+i) = _mm_extract_epi16(z, 0);
    
    484
    +    
    
    485
    +
    
    486
    +    // int yu = ;
    
    487
    +    // *((int*)dest+i) = yu;
    
    485 488
     
    
    486 489
     
    
    487 490
         // take absolute value
    
    488
    -    __m128i y = _mm_abs_epi32( x );  // fabs(x)
    
    491
    +    //__m128i y = _mm_abs_epi32( x );  // fabs(x)
    
    492
    +
    
    489 493
     
    
    490 494
         // cap max value to 1
    
    491
    -    y = _mm_min_epi32( y, _mm_set1_epi32( 4094 ) );
    
    495
    +    //y = _mm_min_epi32( _mm_srli_epi32( y, 4 ), _mm_set1_epi32( 255 ) );
    
    496
    +    __m128i y = _mm_abs_epi32(_mm_srai_epi32(  x , 4 ));
    
    492 497
     
    
    493 498
         // reduce to 255
    
    494
    -    y = _mm_srli_epi32( y, 4 );
    
    499
    +    // y = 
    
    500
    +
    
    501
    +    // // shuffle
    
    502
    +     //y = _mm_shuffle_epi8( y, mask );
    
    503
    +    
    
    504
    +     y = _mm_packus_epi16(_mm_packs_epi32(y, nzero), nzero);
    
    505
    +    //__m128i z = _mm_packus_epi16(_mm_packs_epi32(z, nzero), nzero);
    
    495 506
     
    
    496
    -    // shuffle
    
    497
    -    y = _mm_shuffle_epi8( y, mask );
    
    507
    +    // int* ptr = (int*)&dest[i];
    
    508
    +    *(int*)&dest[i] =  *(int*)&y;
    
    509
    +    //*(int*)&dest[i] =  _mm_extract_epi32(y, 0);
    
    498 510
     
    
    499
    -    _mm_store_ss( (float*)&dest[i], _mm_castsi128_ps(y) );
    
    511
    +    //_mm_store_ss( (float*)&dest[i], _mm_castsi128_ps(y) );
    
    500 512
     
    
    501 513
         // store the current prefix sum in offset
    
    502
    -    offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
    
    503
    -                                               _mm_castsi128_ps( x ),
    
    504
    -                                               _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
    
    514
    +    // offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
    
    515
    +    //                                            _mm_castsi128_ps( x ),
    
    516
    +    //                                            _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
    
    517
    +    offset = _mm_shuffle_epi32(x,_MM_SHUFFLE( 3, 3, 3, 3 ) );
    
    505 518
         //offset = _mm_set1_epi32(_mm_extract_epi32(x, 3));
    
    506 519
       }
    
    507 520
     
    
    ... ... @@ -566,7 +579,7 @@ dense_raster_render( FT_Raster raster, const FT_Raster_Params* params )
    566 579
       worker->m_w = target_map->pitch;
    
    567 580
       worker->m_h = target_map->rows;
    
    568 581
     
    
    569
    -  int size = worker->m_w * worker->m_h + 4;
    
    582
    +  int size = (worker->m_w * worker->m_h + 3) & ~3;
    
    570 583
     
    
    571 584
       worker->m_a      = malloc( sizeof( FT20D12 ) * size );
    
    572 585
       worker->m_a_size = size;
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]