@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
     .tp_methods = executor_methods,
 };

-static int
-move_stubs(
-    _PyUOpInstruction *trace,
-    int trace_length,
-    int stubs_start,
-    int stubs_end
-)
-{
-    memmove(trace + trace_length,
-            trace + stubs_start,
-            (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
-    // Patch up the jump targets
-    for (int i = 0; i < trace_length; i++) {
-        if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
-            trace[i].opcode == _POP_JUMP_IF_TRUE)
-        {
-            int target = trace[i].oparg;
-            if (target >= stubs_start) {
-                target += trace_length - stubs_start;
-                trace[i].oparg = target;
-            }
-        }
-    }
-    return trace_length + stubs_end - stubs_start;
-}
-
 #define TRACE_STACK_SIZE 5

+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
 static int
 translate_bytecode_to_trace(
     PyCodeObject *code,
@@ -790,7 +768,7 @@ translate_bytecode_to_trace(
     }
     assert(code == initial_code);
     // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
-    if (trace_length > 3) {
+    if (trace_length > 4) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
         DPRINTF(1,
                 "Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ translate_bytecode_to_trace(
                 2 * INSTR_IP(initial_instr, code),
                 trace_length,
                 buffer_size - max_length);
-        if (max_length < buffer_size) {
-            // There are stubs
-            if (trace_length < max_length) {
-                // There's a gap before the stubs
-                // Move the stubs back to be immediately after the main trace
-                // (which ends at trace_length)
-                DPRINTF(2,
-                        "Moving %d stub uops back by %d\n",
-                        buffer_size - max_length,
-                        max_length - trace_length);
-                trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
-            }
-            else {
-                assert(trace_length == max_length);
-                // There's no gap
-                trace_length = buffer_size;
-            }
-        }
-        return trace_length;
+        OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+        return 1;
     }
     else {
         OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ translate_bytecode_to_trace(
 #undef DPRINTF
 }

+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
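+/* Bit-vector helpers: bit>>5 selects the 32-bit word and bit&31 the
+ * bit within it, giving one bit per uop buffer index. */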
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+ */
 static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
 {
-    // Stage 1: Replace unneeded _SET_IP uops with NOP.
-    // Note that we don't enter stubs, those SET_IPs are needed.
-    int last_set_ip = -1;
-    int last_instr = 0;
-    bool need_ip = true;
-    for (int pc = 0; pc < trace_length; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode == _SET_IP) {
-            if (!need_ip && last_set_ip >= 0) {
-                trace[last_set_ip].opcode = NOP;
-            }
-            need_ip = false;
-            last_set_ip = pc;
+    int count = 0;
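+    /* The first uop in the buffer is always reachable */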
+    SET_BIT(used, 0);
+    for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
         }
-        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
-            last_instr = pc + 1;
-            break;
+        count++;
+        int opcode = buffer[i].opcode;
+        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            continue;
         }
-        else {
-            // If opcode has ERROR or DEOPT, set need_ip to true
-            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
-                need_ip = true;
-            }
+        /* All other micro-ops fall through, so i+1 is reachable */
+        SET_BIT(used, i + 1);
+        switch (opcode) {
+            case NOP:
+                /* Don't count NOPs as used */
+                count--;
+                UNSET_BIT(used, i);
+                break;
+            case _POP_JUMP_IF_FALSE:
+            case _POP_JUMP_IF_TRUE:
+                /* Mark target as reachable */
+                SET_BIT(used, buffer[i].oparg);
         }
     }
-    // Stage 2: Squash NOP opcodes (pre-existing or set above).
-    int dest = 0;
-    for (int pc = 0; pc < last_instr; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode != NOP) {
-            if (pc != dest) {
-                trace[dest] = trace[pc];
-            }
-            dest++;
-        }
+    return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
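+    /* One bit per buffer slot, rounded up to whole 32-bit words */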
+    uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+    int length = compute_used(buffer, used);
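+    /* `length` counts only reachable, non-NOP uops, so the executor
+     * is allocated at exactly the size it needs */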
+    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+    if (executor == NULL) {
+        return NULL;
     }
-    // Stage 3: Move the stubs back.
-    if (dest < last_instr) {
-        int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
-        char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
-        int lltrace = 0;
-        if (python_lltrace != NULL && *python_lltrace >= '0') {
-            lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
+    int dest = length - 1;
+    /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+    for (int i = _Py_UOP_MAX_TRACE_LENGTH - 1; i >= 0; i--) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
         }
-        if (lltrace >= 2) {
-            printf("Optimized trace (length %d+%d = %d, saved %d):\n",
-                   dest, trace_length - last_instr, new_trace_length,
-                   trace_length - new_trace_length);
-            for (int pc = 0; pc < new_trace_length; pc++) {
-                printf("%4d: (%s, %d, %" PRIu64 ")\n",
-                       pc,
-                       uop_name(trace[pc].opcode),
-                       (trace[pc].oparg),
-                       (uint64_t)(trace[pc].operand));
-            }
+        executor->trace[dest] = buffer[i];
+        int opcode = buffer[i].opcode;
+        if (opcode == _POP_JUMP_IF_FALSE ||
+            opcode == _POP_JUMP_IF_TRUE)
+        {
+            /* The oparg of the target will already have been set to its new offset */
+            int oparg = executor->trace[dest].oparg;
+            executor->trace[dest].oparg = buffer[oparg].oparg;
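+            /* E.g. if the jump's old target buffer[oparg] was copied to
+             * trace[d], buffer[oparg].oparg holds d by now (set below). */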
         }
-#endif
-        trace_length = new_trace_length;
+        /* Set the oparg to be the destination offset,
+         * so that we can set the oparg of earlier jumps correctly. */
+        buffer[i].oparg = dest;
+        dest--;
     }
-    return trace_length;
+    assert(dest == -1);
+    executor->base.execute = _PyUopExecute;
+    _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+    return (_PyExecutorObject *)executor;
 }

 static int
@@ -914,28 +889,26 @@ uop_optimize(
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
-    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
-    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
-    if (trace_length <= 0) {
+    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+    int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+    if (err <= 0) {
         // Error or nothing translated
-        return trace_length;
+        return err;
     }
-    OPT_HIST(trace_length, trace_length_hist);
     OPT_STAT_INC(traces_created);
     char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
-    if (uop_optimize != NULL && *uop_optimize > '0') {
-        trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
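+    /* The optimizer now runs by default; set PYTHONUOPSOPTIMIZE=0 to disable it */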
+    if (uop_optimize == NULL || *uop_optimize > '0') {
+        err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+        if (err < 0) {
+            return -1;
+        }
     }
-    trace_length = remove_unneeded_uops(trace, trace_length);
-    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
     if (executor == NULL) {
         return -1;
     }
-    OPT_HIST(trace_length, optimized_trace_length_hist);
-    executor->base.execute = _PyUopExecute;
-    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
-    _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
-    *exec_ptr = (_PyExecutorObject *)executor;
+    OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+    *exec_ptr = executor;
     return 1;
 }
