@@ -161,6 +161,7 @@ struct io_ring_ctx {
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
 		struct list_head	poll_list;
+		struct list_head	cancel_list;
 	} ____cacheline_aligned_in_smp;

 #if defined(CONFIG_UNIX)
@@ -176,8 +177,20 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };

+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				woken;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
 struct io_kiocb {
-	struct kiocb		rw;
+	union {
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+	};

 	struct sqe_submit	submit;

@@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
 	return ctx;
 }

@@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }

+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		queue_work(req->ctx->sqo_wq, &req->work);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	io_fput(req);
+	io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that io_poll_remove_one() also deletes the request from
+	 * ctx->cancel_list after marking it canceled. We need the
+	 * completion_lock roundtrip here to synchronize with it. In the
+	 * cancellation case the list_del_init() itself is not actually
+	 * needed, but it is harmless, so keep it to avoid further branches
+	 * in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
+	list_del_init(&req->list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+
+	poll->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask) {
+		unsigned long flags;
+
+		if (!(mask & poll->events))
+			return 0;
+
+		/* try to complete the iocb inline if we can: */
+		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			list_del(&req->list);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			list_del_init(&poll->wait.entry);
+			io_poll_complete(req, mask);
+			return 1;
+		}
+	}
+
+	list_del_init(&poll->wait.entry);
+	queue_work(ctx->sqo_wq, &req->work);
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	unsigned flags;
+	__poll_t mask;
+	u16 events;
+	int fd;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		poll->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		poll->file = fget(fd);
+	}
+	if (unlikely(!poll->file))
+		return -EBADF;
+
+	poll->head = NULL;
+	poll->woken = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialize the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&req->refs, 2);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+	if (unlikely(!poll->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	spin_lock(&poll->head->lock);
+	if (poll->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		ipt.error = 0;
+	} else if (mask || ipt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&poll->wait.entry));
+		list_del_init(&poll->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&req->list, &ctx->cancel_list);
+	}
+	spin_unlock(&poll->head->lock);
+	spin_unlock_irq(&ctx->completion_lock);
+
+out:
+	if (unlikely(ipt.error)) {
+		if (!(flags & IOSQE_FIXED_FILE))
+			fput(poll->file);
+		/*
+		 * Drop one of our refs to this req, __io_submit_sqe() will
+		 * drop the other one since we're returning an error.
+		 */
+		io_free_req(req);
+		return ipt.error;
+	}
+
+	if (mask)
+		io_poll_complete(req, mask);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
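
Taken together: io_poll_add() arms the target file's waitqueue via vfs_poll(), io_poll_wake() either completes the request inline or punts it to the workqueue, and io_poll_complete() posts the mangled poll mask as the CQE result. As a rough illustration of how the new opcode might be driven from userspace, here is a minimal sketch; it assumes liburing's setup and prep helpers (not part of this patch), stdin as the watched descriptor, and a made-up user_data value of 0x1234.

/* illustrative only; assumes liburing is installed */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* arm a one-shot poll for readability on stdin */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
	io_uring_sqe_set_data(sqe, (void *)0x1234);	/* arbitrary tag, reused below */
	io_uring_submit(&ring);

	/* the CQE res field carries the signalled poll mask (or a negative error) */
	if (!io_uring_wait_cqe(&ring, &cqe)) {
		if (cqe->res >= 0 && (cqe->res & POLLIN))
			printf("stdin is readable\n");
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}
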
@@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
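
The dispatch also wires up IORING_OP_POLL_REMOVE, which io_poll_remove() services by matching sqe->addr against the user_data of a still-armed poll request on ctx->cancel_list. A sketch of the cancel side, filling the raw struct io_uring_sqe fields directly; the cancel_poll() helper and both user_data values are made up for illustration, and liburing is again assumed only for ring setup and submission.

#include <liburing.h>
#include <string.h>

/* illustrative helper: cancel the poll that was queued with user_data == 0x1234 */
static void cancel_poll(struct io_uring *ring)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;
	sqe->fd = -1;			/* no file is needed for a remove */
	sqe->addr = 0x1234;		/* user_data of the poll request to cancel */
	sqe->user_data = 0x5678;	/* tags the remove request's own CQE */

	io_uring_submit(ring);
	/* the remove's CQE res is 0 on success, -ENOENT if nothing matched */
}
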
@@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);

+	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);