@@ -1620,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken.  Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+	wait_queue_head_t *head;
+
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us, then check whether the request is still on the queue.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	head = smp_load_acquire(&req->head);
+	if (head) {
+		spin_lock(&head->lock);
+		if (!list_empty(&req->wait.entry))
+			return true;
+		spin_unlock(&head->lock);
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+	spin_unlock(&req->head->lock);
+	rcu_read_unlock();
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
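
The two helpers above rely on a contract the patch's own comments spell out: any ->poll() provider that may free its waitqueue early must announce that with wake_up_pollfree() and must RCU-delay the actual free. The sketch below is illustrative only and not part of this commit; struct example_poll_provider and example_poll_provider_destroy() are hypothetical names, while wake_up_pollfree() and kfree_rcu() are the real kernel primitives the comments refer to.

#include <linux/wait.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_poll_provider {
	wait_queue_head_t wqh;		/* what req->head points at */
	struct rcu_head rcu;
};

static void example_poll_provider_destroy(struct example_poll_provider *p)
{
	/*
	 * Wake everything on the queue with POLLFREE in the key, so handlers
	 * such as aio_poll_wake() below detach their requests immediately.
	 */
	wake_up_pollfree(&p->wqh);

	/*
	 * Free only after an RCU grace period, so a concurrent
	 * poll_iocb_lock_wq() that has already entered rcu_read_lock() and
	 * seen a non-NULL req->head can still take p->wqh.lock safely.
	 */
	kfree_rcu(p, rcu);
}
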
@@ -1639,24 +1684,25 @@ static void aio_poll_complete_work(struct work_struct *work)
 	 * avoid further branches in the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	if (!mask && !READ_ONCE(req->cancelled)) {
-		/*
-		 * The request isn't actually ready to be completed yet.
-		 * Reschedule completion if another wakeup came in.
-		 */
-		if (req->work_need_resched) {
-			schedule_work(&req->work);
-			req->work_need_resched = false;
-		} else {
-			req->work_scheduled = false;
+	if (poll_iocb_lock_wq(req)) {
+		if (!mask && !READ_ONCE(req->cancelled)) {
+			/*
+			 * The request isn't actually ready to be completed yet.
+			 * Reschedule completion if another wakeup came in.
+			 */
+			if (req->work_need_resched) {
+				schedule_work(&req->work);
+				req->work_need_resched = false;
+			} else {
+				req->work_scheduled = false;
+			}
+			poll_iocb_unlock_wq(req);
+			spin_unlock_irq(&ctx->ctx_lock);
+			return;
 		}
-		spin_unlock(&req->head->lock);
-		spin_unlock_irq(&ctx->ctx_lock);
-		return;
-	}
-	list_del_init(&req->wait.entry);
-	spin_unlock(&req->head->lock);
+		list_del_init(&req->wait.entry);
+		poll_iocb_unlock_wq(req);
+	} /* else, POLLFREE has freed the waitqueue, so we must complete */
 	list_del_init(&iocb->ki_list);
 	iocb->ki_res.res = mangle_poll(mask);
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1670,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
 	struct poll_iocb *req = &aiocb->poll;
 
-	spin_lock(&req->head->lock);
-	WRITE_ONCE(req->cancelled, true);
-	if (!req->work_scheduled) {
-		schedule_work(&aiocb->poll.work);
-		req->work_scheduled = true;
-	}
-	spin_unlock(&req->head->lock);
+	if (poll_iocb_lock_wq(req)) {
+		WRITE_ONCE(req->cancelled, true);
+		if (!req->work_scheduled) {
+			schedule_work(&aiocb->poll.work);
+			req->work_scheduled = true;
+		}
+		poll_iocb_unlock_wq(req);
+	} /* else, the request was force-cancelled by POLLFREE already */
 
 	return 0;
 }
@@ -1728,21 +1775,45 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		 *
 		 * Don't remove the request from the waitqueue here, as it might
 		 * not actually be complete yet (we won't know until vfs_poll()
-		 * is called), and we must not miss any wakeups.
+		 * is called), and we must not miss any wakeups.  POLLFREE is an
+		 * exception to this; see below.
 		 */
 		if (req->work_scheduled) {
 			req->work_need_resched = true;
 		} else {
 			schedule_work(&req->work);
 			req->work_scheduled = true;
 		}
+
+		/*
+		 * If the waitqueue is being freed early but we can't complete
+		 * the request inline, we have to tear down the request as best
+		 * we can.  That means immediately removing the request from its
+		 * waitqueue and preventing all further accesses to the
+		 * waitqueue via the request.  We also need to schedule the
+		 * completion work (done above).  Also mark the request as
+		 * cancelled, to potentially skip an unneeded call to ->poll().
+		 */
+		if (mask & POLLFREE) {
+			WRITE_ONCE(req->cancelled, true);
+			list_del_init(&req->wait.entry);
+
+			/*
+			 * Careful: this *must* be the last step, since as soon
+			 * as req->head is NULL'ed out, the request can be
+			 * completed and freed, since aio_poll_complete_work()
+			 * will no longer need to take the waitqueue lock.
+			 */
+			smp_store_release(&req->head, NULL);
+		}
 	}
 	return 1;
 }
 
 struct aio_poll_table {
 	struct poll_table_struct pt;
 	struct aio_kiocb *iocb;
+	bool queued;
 	int error;
 };
 
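
The "Careful: this *must* be the last step" comment describes an acquire/release pairing: every teardown write in the POLLFREE branch has to be published before poll_iocb_lock_wq()'s smp_load_acquire() can observe req->head as NULL. As a rough analogue only (userspace C11 atomics standing in for the kernel's smp_store_release()/smp_load_acquire(); fake_req and fake_head are made-up types, and the RCU and spinlock parts are deliberately omitted), the publish-last pattern looks like this:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct fake_head;				/* stands in for wait_queue_head_t */

struct fake_req {
	_Atomic(struct fake_head *) head;	/* stands in for req->head */
	bool cancelled;				/* stands in for req->cancelled */
};

/* Wakeup side: do all teardown writes first, then publish "detached" last. */
static void fake_pollfree_teardown(struct fake_req *req)
{
	req->cancelled = true;		/* plus, in the kernel, list_del_init() */
	atomic_store_explicit(&req->head, NULL, memory_order_release);
}

/*
 * Lock/cancel side: if the acquire load sees NULL, it is guaranteed to also
 * see every write made before the release store above.  In the kernel, a
 * non-NULL result is additionally kept safe to dereference by RCU.
 */
static bool fake_req_still_attached(struct fake_req *req)
{
	return atomic_load_explicit(&req->head, memory_order_acquire) != NULL;
}
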
@@ -1753,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
 
 	/* multiple wait queues per file are not supported */
-	if (unlikely(pt->iocb->poll.head)) {
+	if (unlikely(pt->queued)) {
 		pt->error = -EINVAL;
 		return;
 	}
 
+	pt->queued = true;
 	pt->error = 0;
 	pt->iocb->poll.head = head;
 	add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1789,6 +1861,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
 	apt.iocb = aiocb;
+	apt.queued = false;
 	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
 
 	/* initialized the list so that we can do list_empty checks */
@@ -1797,9 +1870,10 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(req->head)) {
-		spin_lock(&req->head->lock);
-		if (list_empty(&req->wait.entry) || req->work_scheduled) {
+	if (likely(apt.queued)) {
+		bool on_queue = poll_iocb_lock_wq(req);
+
+		if (!on_queue || req->work_scheduled) {
 			/*
 			 * aio_poll_wake() already either scheduled the async
 			 * completion work, or completed the request inline.
@@ -1815,15 +1889,16 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 		} else if (cancel) {
 			/* Cancel if possible (may be too late though). */
 			WRITE_ONCE(req->cancelled, true);
-		} else if (!list_empty(&req->wait.entry)) {
+		} else if (on_queue) {
 			/*
 			 * Actually waiting for an event, so add the request to
 			 * active_reqs so that it can be cancelled if needed.
 			 */
 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
 			aiocb->ki_cancel = aio_poll_cancel;
 		}
-		spin_unlock(&req->head->lock);
+		if (on_queue)
+			poll_iocb_unlock_wq(req);
 	}
 	if (mask) { /* no async, we'd stolen it */
 		aiocb->ki_res.res = mangle_poll(mask);
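
For context on what this path serves: IOCB_CMD_POLL requests arrive from userspace through io_submit(), with the poll event mask carried in iocb->aio_buf and the (mangled) result returned in the completion event's res field. The program below is a minimal, illustrative userspace sketch using the raw aio syscalls; it is not part of the commit, assumes a kernel that supports IOCB_CMD_POLL, and abbreviates error handling.

#include <linux/aio_abi.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;

	if (syscall(SYS_io_setup, 1, &ctx))
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_fildes = STDIN_FILENO;
	cb.aio_buf = POLLIN;		/* poll events are passed in aio_buf */

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* Blocks until stdin is readable; ev.res holds the returned mask. */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("poll result mask: 0x%llx\n", (unsigned long long)ev.res);

	return syscall(SYS_io_destroy, ctx) ? 1 : 0;
}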